Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,6 @@
|
|
| 2 |
# import json
|
| 3 |
# import hashlib
|
| 4 |
# import shutil
|
| 5 |
-
# from io import BytesIO
|
| 6 |
# from typing import List, Tuple
|
| 7 |
|
| 8 |
# import gradio as gr
|
|
@@ -15,13 +14,22 @@
|
|
| 15 |
# # ---------------- Config ----------------
|
| 16 |
# OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
| 17 |
# OPENROUTER_MODEL = "nvidia/nemotron-nano-12b-v2-vl:free"
|
| 18 |
-
# EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L3-v2"
|
| 19 |
# CACHE_DIR = "./cache"
|
| 20 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# os.makedirs(CACHE_DIR, exist_ok=True)
|
| 23 |
|
| 24 |
-
# # Lazy loaded
|
| 25 |
# embedder = None
|
| 26 |
|
| 27 |
# def get_embedder():
|
|
@@ -32,45 +40,77 @@
|
|
| 32 |
# print("Embedder loaded.")
|
| 33 |
# return embedder
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
#
|
|
|
|
|
|
|
| 37 |
# EMBEDDINGS: np.ndarray = None
|
| 38 |
# FAISS_INDEX = None
|
| 39 |
-
#
|
| 40 |
|
| 41 |
|
| 42 |
-
# # ---------------- Cache cleanup
|
| 43 |
# def clear_old_cache():
|
| 44 |
# try:
|
| 45 |
# if os.path.exists(CACHE_DIR):
|
| 46 |
# shutil.rmtree(CACHE_DIR)
|
| 47 |
# os.makedirs(CACHE_DIR, exist_ok=True)
|
| 48 |
-
# print("Cache cleared.")
|
| 49 |
# except Exception as e:
|
| 50 |
# print(f"[Cache cleanup error] {e}")
|
| 51 |
|
| 52 |
|
| 53 |
-
# # ---------------- PDF extraction ----------------
|
| 54 |
-
# def
|
|
|
|
| 55 |
# try:
|
| 56 |
# doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 57 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# except Exception as e:
|
| 59 |
-
# return f"[PDF extraction error] {e}"
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
# # ----------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
|
| 64 |
# h = hashlib.sha256()
|
| 65 |
# for name, b in sorted(files, key=lambda x: x[0]):
|
| 66 |
# h.update(name.encode())
|
| 67 |
-
# h.update(str(len(b)).encode())
|
| 68 |
# h.update(hashlib.sha256(b).digest())
|
| 69 |
# return h.hexdigest()
|
| 70 |
|
| 71 |
-
# def cache_save(cache_key: str, embeddings: np.ndarray,
|
| 72 |
-
#
|
| 73 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# def cache_load(cache_key: str):
|
| 76 |
# path = os.path.join(CACHE_DIR, f"{cache_key}.npz")
|
|
@@ -78,72 +118,74 @@
|
|
| 78 |
# return None
|
| 79 |
# try:
|
| 80 |
# data = np.load(path, allow_pickle=True)
|
| 81 |
-
# return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
# except:
|
| 83 |
# return None
|
| 84 |
|
|
|
|
|
|
|
| 85 |
# def build_faiss(emb: np.ndarray):
|
| 86 |
# global FAISS_INDEX
|
| 87 |
# if emb is None or len(emb) == 0:
|
| 88 |
# FAISS_INDEX = None
|
| 89 |
-
# return
|
| 90 |
# emb = emb.astype("float32")
|
| 91 |
# index = faiss.IndexFlatL2(emb.shape[1])
|
| 92 |
# index.add(emb)
|
| 93 |
# FAISS_INDEX = index
|
| 94 |
-
# return index
|
| 95 |
|
| 96 |
-
# def search(query: str, k: int =
|
| 97 |
-
# if FAISS_INDEX is None:
|
| 98 |
# return []
|
| 99 |
# q_emb = get_embedder().encode([query], convert_to_numpy=True).astype("float32")
|
| 100 |
# D, I = FAISS_INDEX.search(q_emb, k)
|
| 101 |
-
#
|
| 102 |
-
#
|
| 103 |
-
#
|
| 104 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
|
| 107 |
# # ---------------- OpenRouter API ----------------
|
| 108 |
-
# def call_openrouter(
|
| 109 |
# if not OPENROUTER_API_KEY:
|
| 110 |
-
# return "
|
| 111 |
|
| 112 |
# url = "https://openrouter.ai/api/v1/chat/completions"
|
| 113 |
# headers = {
|
| 114 |
# "Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
| 115 |
# "Content-Type": "application/json",
|
| 116 |
# }
|
| 117 |
-
|
| 118 |
# payload = {
|
| 119 |
# "model": OPENROUTER_MODEL,
|
| 120 |
-
# "messages": [
|
| 121 |
-
# {"role": "system",
|
| 122 |
-
# "content": SYSTEM_PROMPT + " Always respond in plain text. Avoid markdown."},
|
| 123 |
-
# {"role": "user", "content": prompt},
|
| 124 |
-
# ],
|
| 125 |
# }
|
| 126 |
|
| 127 |
# try:
|
| 128 |
# r = requests.post(url, headers=headers, json=payload, timeout=60)
|
| 129 |
# r.raise_for_status()
|
| 130 |
# obj = r.json()
|
| 131 |
-
|
| 132 |
# if "choices" in obj and obj["choices"]:
|
| 133 |
-
#
|
| 134 |
-
#
|
| 135 |
-
# return "[Unexpected OpenRouter response]"
|
| 136 |
# except Exception as e:
|
| 137 |
-
# return f"[OpenRouter
|
| 138 |
|
| 139 |
|
| 140 |
-
# # ----------
|
| 141 |
# def read_file_bytes(f) -> Tuple[str, bytes]:
|
| 142 |
-
# # tuple (name, bytes)
|
| 143 |
# if isinstance(f, tuple) and len(f) == 2 and isinstance(f[1], (bytes, bytearray)):
|
| 144 |
# return f[0], bytes(f[1])
|
| 145 |
-
|
| 146 |
-
# # dict-like
|
| 147 |
# if isinstance(f, dict):
|
| 148 |
# name = f.get("name") or f.get("filename") or "uploaded"
|
| 149 |
# data = f.get("data") or f.get("content") or f.get("value") or f.get("file")
|
|
@@ -158,16 +200,12 @@
|
|
| 158 |
# if tmp_path and isinstance(tmp_path, str) and os.path.exists(tmp_path):
|
| 159 |
# with open(tmp_path, "rb") as fh:
|
| 160 |
# return os.path.basename(tmp_path), fh.read()
|
| 161 |
-
|
| 162 |
-
# # file-like object with read()
|
| 163 |
# if hasattr(f, "name") and hasattr(f, "read"):
|
| 164 |
# try:
|
| 165 |
# name = os.path.basename(f.name) if getattr(f, "name", None) else "uploaded"
|
| 166 |
# return name, f.read()
|
| 167 |
# except Exception:
|
| 168 |
# pass
|
| 169 |
-
|
| 170 |
-
# # NamedString-like: has .name and .value
|
| 171 |
# if hasattr(f, "name") and hasattr(f, "value"):
|
| 172 |
# name = os.path.basename(getattr(f, "name") or "uploaded")
|
| 173 |
# v = getattr(f, "value")
|
|
@@ -175,23 +213,19 @@
|
|
| 175 |
# return name, bytes(v)
|
| 176 |
# if isinstance(v, str):
|
| 177 |
# return name, v.encode("utf-8")
|
| 178 |
-
|
| 179 |
-
# # string path
|
| 180 |
# if isinstance(f, str) and os.path.exists(f):
|
| 181 |
# with open(f, "rb") as fh:
|
| 182 |
# return os.path.basename(f), fh.read()
|
| 183 |
-
|
| 184 |
# raise ValueError(f"Unsupported file object type: {type(f)}")
|
| 185 |
|
| 186 |
|
| 187 |
-
# # ----------------
|
| 188 |
# def upload_and_index(files):
|
| 189 |
-
# global
|
| 190 |
|
| 191 |
# if not files:
|
| 192 |
-
# return "No
|
| 193 |
|
| 194 |
-
# # Clear old cache on every new upload to free up space
|
| 195 |
# clear_old_cache()
|
| 196 |
|
| 197 |
# processed = []
|
|
@@ -203,77 +237,300 @@
|
|
| 203 |
# name, b = read_file_bytes(f)
|
| 204 |
# processed.append((name, b))
|
| 205 |
# except ValueError as e:
|
| 206 |
-
# return f"Upload error: {e}", ""
|
| 207 |
-
|
| 208 |
-
# preview = [{"name": n, "size": len(b)} for n, b in processed]
|
| 209 |
|
| 210 |
# cache_key = make_cache_key(processed)
|
| 211 |
-
# CURRENT_CACHE_KEY = cache_key
|
| 212 |
-
|
| 213 |
# cached = cache_load(cache_key)
|
|
|
|
| 214 |
# if cached:
|
| 215 |
-
# EMBEDDINGS,
|
| 216 |
# EMBEDDINGS = np.array(EMBEDDINGS)
|
| 217 |
-
# DOCS = [extract_text_from_pdf(b) for _, b in processed]
|
| 218 |
# build_faiss(EMBEDDINGS)
|
| 219 |
-
#
|
| 220 |
-
|
| 221 |
-
#
|
| 222 |
-
#
|
| 223 |
-
#
|
| 224 |
-
|
| 225 |
-
#
|
| 226 |
-
#
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
# build_faiss(EMBEDDINGS)
|
| 229 |
|
| 230 |
-
# return
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
# # ---------------- Question Answering ----------------
|
| 234 |
-
# def ask(question: str):
|
| 235 |
-
# if not question:
|
| 236 |
-
# return "Please enter a question."
|
| 237 |
-
# if not DOCS:
|
| 238 |
-
# return "No PDFs indexed. Please upload a PDF first."
|
| 239 |
-
|
| 240 |
-
# results = search(question)
|
| 241 |
-
|
| 242 |
-
# if not results:
|
| 243 |
-
# return "No relevant text found in the uploaded PDFs."
|
| 244 |
-
|
| 245 |
-
# context = "\n".join(
|
| 246 |
-
# f"Source: {r['source']}\n\n{r['text'][:15000]}\n---\n"
|
| 247 |
-
# for r in results
|
| 248 |
# )
|
| 249 |
|
| 250 |
-
#
|
| 251 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
|
| 254 |
# # ---------------- Gradio UI ----------------
|
| 255 |
-
# with gr.Blocks(
|
| 256 |
-
#
|
| 257 |
-
|
| 258 |
-
#
|
| 259 |
-
#
|
| 260 |
-
#
|
| 261 |
-
#
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
#
|
| 266 |
-
#
|
| 267 |
-
#
|
| 268 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
-
# ask_btn.click(ask, inputs=[q], outputs=[answer])
|
| 271 |
|
| 272 |
# if __name__ == "__main__":
|
| 273 |
# demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
| 274 |
|
| 275 |
import os
|
| 276 |
-
import json
|
| 277 |
import hashlib
|
| 278 |
import shutil
|
| 279 |
from typing import List, Tuple
|
|
@@ -290,31 +547,27 @@ OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
|
| 290 |
OPENROUTER_MODEL = "nvidia/nemotron-nano-12b-v2-vl:free"
|
| 291 |
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L3-v2"
|
| 292 |
CACHE_DIR = "./cache"
|
| 293 |
-
CHUNK_SIZE = 300
|
| 294 |
-
CHUNK_OVERLAP = 50
|
| 295 |
-
TOP_K = 4
|
| 296 |
|
| 297 |
SYSTEM_PROMPT = (
|
| 298 |
"You are an expert document assistant. "
|
| 299 |
"Answer questions using ONLY the provided context from the uploaded PDFs. "
|
| 300 |
-
"Be concise, accurate, and
|
| 301 |
"Always respond in plain text. Avoid markdown formatting."
|
| 302 |
)
|
| 303 |
|
| 304 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 305 |
|
| 306 |
-
# Lazy loaded to avoid OOM on HF Spaces
|
| 307 |
embedder = None
|
| 308 |
|
| 309 |
def get_embedder():
|
| 310 |
global embedder
|
| 311 |
if embedder is None:
|
| 312 |
-
print("Loading embedder model...")
|
| 313 |
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
|
| 314 |
-
print("Embedder loaded.")
|
| 315 |
return embedder
|
| 316 |
|
| 317 |
-
# Global state
|
| 318 |
CHUNKS: List[str] = []
|
| 319 |
CHUNK_SOURCES: List[str] = []
|
| 320 |
CHUNK_PAGES: List[int] = []
|
|
@@ -323,19 +576,16 @@ FAISS_INDEX = None
|
|
| 323 |
INDEXED_FILES: List[dict] = []
|
| 324 |
|
| 325 |
|
| 326 |
-
# ---------------- Cache cleanup ----------------
|
| 327 |
def clear_old_cache():
|
| 328 |
try:
|
| 329 |
if os.path.exists(CACHE_DIR):
|
| 330 |
shutil.rmtree(CACHE_DIR)
|
| 331 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 332 |
except Exception as e:
|
| 333 |
-
print(f"[Cache
|
| 334 |
|
| 335 |
|
| 336 |
-
# ---------------- PDF extraction with page tracking ----------------
|
| 337 |
def extract_pages_from_pdf(file_bytes: bytes) -> List[Tuple[int, str]]:
|
| 338 |
-
"""Returns list of (page_number, page_text)"""
|
| 339 |
try:
|
| 340 |
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 341 |
pages = []
|
|
@@ -345,30 +595,22 @@ def extract_pages_from_pdf(file_bytes: bytes) -> List[Tuple[int, str]]:
|
|
| 345 |
pages.append((i + 1, text))
|
| 346 |
return pages
|
| 347 |
except Exception as e:
|
| 348 |
-
return [(0, f"[PDF
|
| 349 |
|
| 350 |
|
| 351 |
-
|
| 352 |
-
def chunk_text(text: str, source: str, page: int,
|
| 353 |
-
chunk_size: int = CHUNK_SIZE,
|
| 354 |
-
overlap: int = CHUNK_OVERLAP) -> List[Tuple[str, str, int]]:
|
| 355 |
-
"""
|
| 356 |
-
Splits text into overlapping word-level chunks.
|
| 357 |
-
Returns list of (chunk_text, source, page)
|
| 358 |
-
"""
|
| 359 |
words = text.split()
|
| 360 |
chunks = []
|
| 361 |
-
step =
|
| 362 |
for i in range(0, len(words), step):
|
| 363 |
-
chunk = " ".join(words[i: i +
|
| 364 |
if len(chunk.strip()) > 50:
|
| 365 |
chunks.append((chunk, source, page))
|
| 366 |
-
if i +
|
| 367 |
break
|
| 368 |
return chunks
|
| 369 |
|
| 370 |
|
| 371 |
-
# ---------------- Cache helpers ----------------
|
| 372 |
def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
|
| 373 |
h = hashlib.sha256()
|
| 374 |
for name, b in sorted(files, key=lambda x: x[0]):
|
|
@@ -376,8 +618,7 @@ def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
|
|
| 376 |
h.update(hashlib.sha256(b).digest())
|
| 377 |
return h.hexdigest()
|
| 378 |
|
| 379 |
-
def cache_save(cache_key
|
| 380 |
-
chunks: List[str], sources: List[str], pages: List[int]):
|
| 381 |
np.savez_compressed(
|
| 382 |
os.path.join(CACHE_DIR, f"{cache_key}.npz"),
|
| 383 |
embeddings=embeddings,
|
|
@@ -386,24 +627,18 @@ def cache_save(cache_key: str, embeddings: np.ndarray,
|
|
| 386 |
pages=np.array(pages),
|
| 387 |
)
|
| 388 |
|
| 389 |
-
def cache_load(cache_key
|
| 390 |
path = os.path.join(CACHE_DIR, f"{cache_key}.npz")
|
| 391 |
if not os.path.exists(path):
|
| 392 |
return None
|
| 393 |
try:
|
| 394 |
data = np.load(path, allow_pickle=True)
|
| 395 |
-
return (
|
| 396 |
-
data["embeddings"],
|
| 397 |
-
data["chunks"].tolist(),
|
| 398 |
-
data["sources"].tolist(),
|
| 399 |
-
data["pages"].tolist(),
|
| 400 |
-
)
|
| 401 |
except:
|
| 402 |
return None
|
| 403 |
|
| 404 |
|
| 405 |
-
|
| 406 |
-
def build_faiss(emb: np.ndarray):
|
| 407 |
global FAISS_INDEX
|
| 408 |
if emb is None or len(emb) == 0:
|
| 409 |
FAISS_INDEX = None
|
|
@@ -413,50 +648,35 @@ def build_faiss(emb: np.ndarray):
|
|
| 413 |
index.add(emb)
|
| 414 |
FAISS_INDEX = index
|
| 415 |
|
| 416 |
-
def search(query: str
|
| 417 |
if FAISS_INDEX is None or not CHUNKS:
|
| 418 |
return []
|
| 419 |
q_emb = get_embedder().encode([query], convert_to_numpy=True).astype("float32")
|
| 420 |
-
D, I = FAISS_INDEX.search(q_emb,
|
| 421 |
results = []
|
| 422 |
for d, i in zip(D[0], I[0]):
|
| 423 |
-
if
|
| 424 |
-
results.append({
|
| 425 |
-
"text": CHUNKS[i],
|
| 426 |
-
"source": CHUNK_SOURCES[i],
|
| 427 |
-
"page": CHUNK_PAGES[i],
|
| 428 |
-
"distance": float(d),
|
| 429 |
-
})
|
| 430 |
return results
|
| 431 |
|
| 432 |
|
| 433 |
-
# ---------------- OpenRouter API ----------------
|
| 434 |
def call_openrouter(messages: list) -> str:
|
| 435 |
if not OPENROUTER_API_KEY:
|
| 436 |
-
return "Error: OPENROUTER_API_KEY
|
| 437 |
-
|
| 438 |
url = "https://openrouter.ai/api/v1/chat/completions"
|
| 439 |
-
headers = {
|
| 440 |
-
|
| 441 |
-
"Content-Type": "application/json",
|
| 442 |
-
}
|
| 443 |
-
payload = {
|
| 444 |
-
"model": OPENROUTER_MODEL,
|
| 445 |
-
"messages": [{"role": "system", "content": SYSTEM_PROMPT}] + messages,
|
| 446 |
-
}
|
| 447 |
-
|
| 448 |
try:
|
| 449 |
r = requests.post(url, headers=headers, json=payload, timeout=60)
|
| 450 |
r.raise_for_status()
|
| 451 |
obj = r.json()
|
| 452 |
if "choices" in obj and obj["choices"]:
|
| 453 |
return obj["choices"][0]["message"]["content"].strip().replace("```", "")
|
| 454 |
-
return "[Unexpected
|
| 455 |
except Exception as e:
|
| 456 |
return f"[OpenRouter error] {e}"
|
| 457 |
|
| 458 |
|
| 459 |
-
# ---------------- File bytes reader ----------------
|
| 460 |
def read_file_bytes(f) -> Tuple[str, bytes]:
|
| 461 |
if isinstance(f, tuple) and len(f) == 2 and isinstance(f[1], (bytes, bytearray)):
|
| 462 |
return f[0], bytes(f[1])
|
|
@@ -466,42 +686,34 @@ def read_file_bytes(f) -> Tuple[str, bytes]:
|
|
| 466 |
if isinstance(data, (bytes, bytearray)):
|
| 467 |
return name, bytes(data)
|
| 468 |
if isinstance(data, str):
|
| 469 |
-
try:
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
with open(tmp_path, "rb") as fh:
|
| 476 |
-
return os.path.basename(tmp_path), fh.read()
|
| 477 |
if hasattr(f, "name") and hasattr(f, "read"):
|
| 478 |
try:
|
| 479 |
name = os.path.basename(f.name) if getattr(f, "name", None) else "uploaded"
|
| 480 |
return name, f.read()
|
| 481 |
-
except
|
| 482 |
-
pass
|
| 483 |
if hasattr(f, "name") and hasattr(f, "value"):
|
| 484 |
name = os.path.basename(getattr(f, "name") or "uploaded")
|
| 485 |
v = getattr(f, "value")
|
| 486 |
-
if isinstance(v, (bytes, bytearray)):
|
| 487 |
-
|
| 488 |
-
if isinstance(v, str):
|
| 489 |
-
return name, v.encode("utf-8")
|
| 490 |
if isinstance(f, str) and os.path.exists(f):
|
| 491 |
with open(f, "rb") as fh:
|
| 492 |
return os.path.basename(f), fh.read()
|
| 493 |
-
raise ValueError(f"Unsupported file
|
| 494 |
|
| 495 |
|
| 496 |
-
# ---------------- Upload & Index ----------------
|
| 497 |
def upload_and_index(files):
|
| 498 |
global CHUNKS, CHUNK_SOURCES, CHUNK_PAGES, EMBEDDINGS, INDEXED_FILES
|
| 499 |
-
|
| 500 |
if not files:
|
| 501 |
-
return "
|
| 502 |
|
| 503 |
clear_old_cache()
|
| 504 |
-
|
| 505 |
processed = []
|
| 506 |
if not isinstance(files, (list, tuple)):
|
| 507 |
files = [files]
|
|
@@ -511,7 +723,7 @@ def upload_and_index(files):
|
|
| 511 |
name, b = read_file_bytes(f)
|
| 512 |
processed.append((name, b))
|
| 513 |
except ValueError as e:
|
| 514 |
-
return f"Upload error: {e}",
|
| 515 |
|
| 516 |
cache_key = make_cache_key(processed)
|
| 517 |
cached = cache_load(cache_key)
|
|
@@ -522,8 +734,8 @@ def upload_and_index(files):
|
|
| 522 |
build_faiss(EMBEDDINGS)
|
| 523 |
INDEXED_FILES = [{"name": n, "size_kb": round(len(b)/1024, 1)} for n, b in processed]
|
| 524 |
return (
|
| 525 |
-
f"Loaded from cache β {len(CHUNKS)} chunks across {len(processed)} PDF(s).",
|
| 526 |
-
|
| 527 |
)
|
| 528 |
|
| 529 |
all_chunks, all_sources, all_pages = [], [], []
|
|
@@ -538,82 +750,86 @@ def upload_and_index(files):
|
|
| 538 |
all_sources.append(src)
|
| 539 |
all_pages.append(pg)
|
| 540 |
file_chunks += 1
|
| 541 |
-
INDEXED_FILES.append({
|
| 542 |
-
"name": name,
|
| 543 |
-
"size_kb": round(len(b) / 1024, 1),
|
| 544 |
-
"pages": len(pages),
|
| 545 |
-
"chunks": file_chunks,
|
| 546 |
-
})
|
| 547 |
|
| 548 |
CHUNKS = all_chunks
|
| 549 |
CHUNK_SOURCES = all_sources
|
| 550 |
CHUNK_PAGES = all_pages
|
| 551 |
|
| 552 |
if not CHUNKS:
|
| 553 |
-
return "Could not extract
|
| 554 |
|
| 555 |
EMBEDDINGS = get_embedder().encode(CHUNKS, convert_to_numpy=True).astype("float32")
|
| 556 |
cache_save(cache_key, EMBEDDINGS, CHUNKS, CHUNK_SOURCES, CHUNK_PAGES)
|
| 557 |
build_faiss(EMBEDDINGS)
|
| 558 |
|
| 559 |
return (
|
| 560 |
-
f"
|
| 561 |
-
|
| 562 |
)
|
| 563 |
|
| 564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
if not files:
|
| 566 |
-
return "No files indexed yet
|
| 567 |
-
|
| 568 |
for f in files:
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 576 |
|
| 577 |
|
| 578 |
-
# ---------------- Chat ----------------
|
| 579 |
def chat(message: str, history: list):
|
| 580 |
if not message.strip():
|
| 581 |
return "", history
|
| 582 |
-
|
| 583 |
if not CHUNKS:
|
| 584 |
-
history.append((message, "
|
| 585 |
return "", history
|
| 586 |
|
| 587 |
results = search(message)
|
| 588 |
if not results:
|
| 589 |
-
history.append((message, "No relevant content found in the uploaded PDFs."))
|
| 590 |
return "", history
|
| 591 |
|
| 592 |
context_parts = []
|
| 593 |
sources_used = []
|
| 594 |
for r in results:
|
| 595 |
context_parts.append(f"[From: {r['source']}, Page {r['page']}]\n{r['text']}")
|
| 596 |
-
|
| 597 |
-
if
|
| 598 |
-
sources_used.append(
|
| 599 |
|
| 600 |
context = "\n\n---\n\n".join(context_parts)
|
| 601 |
-
|
| 602 |
-
# Multi-turn: include last 4 exchanges
|
| 603 |
messages = []
|
| 604 |
-
for
|
| 605 |
-
messages.append({"role": "user", "content":
|
| 606 |
-
messages.append({"role": "assistant", "content":
|
| 607 |
-
|
| 608 |
-
messages.append({
|
| 609 |
-
"role": "user",
|
| 610 |
-
"content": f"Context from PDFs:\n\n{context}\n\nQuestion: {message}"
|
| 611 |
-
})
|
| 612 |
|
| 613 |
answer = call_openrouter(messages)
|
| 614 |
-
|
| 615 |
if sources_used:
|
| 616 |
-
answer += f"\n\
|
| 617 |
|
| 618 |
history.append((message, answer))
|
| 619 |
return "", history
|
|
@@ -623,170 +839,381 @@ def clear_chat():
|
|
| 623 |
return []
|
| 624 |
|
| 625 |
|
| 626 |
-
#
|
| 627 |
-
|
| 628 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 629 |
|
| 630 |
:root {
|
| 631 |
-
--
|
| 632 |
-
--
|
| 633 |
-
--
|
| 634 |
-
--
|
| 635 |
-
--
|
| 636 |
-
--
|
| 637 |
-
--
|
| 638 |
-
--
|
|
|
|
|
|
|
| 639 |
}
|
| 640 |
|
| 641 |
-
|
| 642 |
-
background: var(--bg) !important;
|
| 643 |
-
font-family: 'DM Mono', monospace !important;
|
| 644 |
-
color: var(--text) !important;
|
| 645 |
-
}
|
| 646 |
|
| 647 |
.gradio-container {
|
| 648 |
-
|
|
|
|
| 649 |
margin: 0 auto !important;
|
|
|
|
|
|
|
| 650 |
}
|
| 651 |
|
| 652 |
-
|
|
|
|
|
|
|
| 653 |
text-align: center;
|
| 654 |
-
padding:
|
| 655 |
-
border-bottom:
|
| 656 |
-
|
| 657 |
}
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
-webkit-background-clip: text;
|
| 665 |
-
-webkit-text-fill-color: transparent;
|
| 666 |
-
background-clip: text;
|
| 667 |
-
margin: 0 0 6px;
|
| 668 |
-
letter-spacing: -1px;
|
| 669 |
}
|
| 670 |
-
|
| 671 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 672 |
color: var(--muted);
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 676 |
}
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 683 |
text-transform: uppercase;
|
| 684 |
-
color: var(--
|
| 685 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 687 |
|
| 688 |
-
|
| 689 |
-
background:
|
|
|
|
| 690 |
border: 1px solid var(--border) !important;
|
|
|
|
|
|
|
| 691 |
border-radius: 8px !important;
|
| 692 |
-
|
| 693 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
font-size: 0.87rem !important;
|
|
|
|
|
|
|
|
|
|
| 695 |
}
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
|
|
|
|
|
|
|
|
|
| 699 |
}
|
| 700 |
|
| 701 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 702 |
text-align: center;
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
font-
|
| 706 |
-
font-
|
| 707 |
-
|
|
|
|
|
|
|
| 708 |
}
|
| 709 |
-
"""
|
| 710 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 711 |
|
| 712 |
-
#
|
|
|
|
|
|
|
| 713 |
with gr.Blocks(
|
| 714 |
title="PDF RAG Bot",
|
| 715 |
-
css=
|
| 716 |
theme=gr.themes.Base(
|
| 717 |
-
primary_hue="
|
| 718 |
-
neutral_hue="
|
| 719 |
-
)
|
| 720 |
) as demo:
|
| 721 |
|
|
|
|
| 722 |
gr.HTML("""
|
| 723 |
-
<div class="
|
| 724 |
-
|
| 725 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
</div>
|
| 727 |
""")
|
| 728 |
|
| 729 |
-
|
|
|
|
| 730 |
|
| 731 |
-
#
|
| 732 |
-
with gr.Column(scale=
|
| 733 |
-
gr.HTML('<div class="
|
| 734 |
|
| 735 |
file_input = gr.File(
|
| 736 |
-
label="Drop
|
| 737 |
file_count="multiple",
|
| 738 |
file_types=[".pdf"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 739 |
)
|
| 740 |
-
upload_btn = gr.Button("β‘ Upload & Index", variant="primary", size="lg")
|
| 741 |
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
interactive=False,
|
| 745 |
-
lines=2,
|
| 746 |
)
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
)
|
| 753 |
|
| 754 |
-
#
|
| 755 |
-
with gr.Column(scale=
|
| 756 |
-
gr.HTML('<div class="
|
| 757 |
|
| 758 |
chatbot = gr.Chatbot(
|
| 759 |
label="",
|
| 760 |
-
height=
|
| 761 |
bubble_full_width=False,
|
| 762 |
show_label=False,
|
| 763 |
-
|
|
|
|
| 764 |
)
|
| 765 |
|
| 766 |
-
with gr.Row():
|
| 767 |
question = gr.Textbox(
|
| 768 |
label="",
|
| 769 |
-
placeholder="
|
| 770 |
lines=2,
|
| 771 |
scale=5,
|
| 772 |
show_label=False,
|
|
|
|
| 773 |
)
|
| 774 |
-
with gr.Column(scale=
|
| 775 |
-
send_btn = gr.Button("
|
| 776 |
-
clear_btn = gr.Button("Clear", variant="secondary")
|
| 777 |
|
|
|
|
| 778 |
gr.HTML("""
|
| 779 |
-
<div class="
|
| 780 |
-
Powered by OpenRouter Β·
|
| 781 |
-
|
| 782 |
</div>
|
| 783 |
""")
|
| 784 |
|
| 785 |
-
# Events
|
| 786 |
upload_btn.click(
|
| 787 |
upload_and_index,
|
| 788 |
inputs=[file_input],
|
| 789 |
-
outputs=[
|
| 790 |
)
|
| 791 |
send_btn.click(
|
| 792 |
chat,
|
|
|
|
| 2 |
# import json
|
| 3 |
# import hashlib
|
| 4 |
# import shutil
|
|
|
|
| 5 |
# from typing import List, Tuple
|
| 6 |
|
| 7 |
# import gradio as gr
|
|
|
|
| 14 |
# # ---------------- Config ----------------
|
| 15 |
# OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
|
| 16 |
# OPENROUTER_MODEL = "nvidia/nemotron-nano-12b-v2-vl:free"
|
| 17 |
+
# EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L3-v2"
|
| 18 |
# CACHE_DIR = "./cache"
|
| 19 |
+
# CHUNK_SIZE = 300 # words per chunk
|
| 20 |
+
# CHUNK_OVERLAP = 50 # overlapping words between chunks
|
| 21 |
+
# TOP_K = 4 # number of chunks to retrieve
|
| 22 |
+
|
| 23 |
+
# SYSTEM_PROMPT = (
|
| 24 |
+
# "You are an expert document assistant. "
|
| 25 |
+
# "Answer questions using ONLY the provided context from the uploaded PDFs. "
|
| 26 |
+
# "Be concise, accurate, and cite which document your answer comes from. "
|
| 27 |
+
# "Always respond in plain text. Avoid markdown formatting."
|
| 28 |
+
# )
|
| 29 |
|
| 30 |
# os.makedirs(CACHE_DIR, exist_ok=True)
|
| 31 |
|
| 32 |
+
# # Lazy loaded to avoid OOM on HF Spaces
|
| 33 |
# embedder = None
|
| 34 |
|
| 35 |
# def get_embedder():
|
|
|
|
| 40 |
# print("Embedder loaded.")
|
| 41 |
# return embedder
|
| 42 |
|
| 43 |
+
# # Global state
|
| 44 |
+
# CHUNKS: List[str] = []
|
| 45 |
+
# CHUNK_SOURCES: List[str] = []
|
| 46 |
+
# CHUNK_PAGES: List[int] = []
|
| 47 |
# EMBEDDINGS: np.ndarray = None
|
| 48 |
# FAISS_INDEX = None
|
| 49 |
+
# INDEXED_FILES: List[dict] = []
|
| 50 |
|
| 51 |
|
| 52 |
+
# # ---------------- Cache cleanup ----------------
|
| 53 |
# def clear_old_cache():
|
| 54 |
# try:
|
| 55 |
# if os.path.exists(CACHE_DIR):
|
| 56 |
# shutil.rmtree(CACHE_DIR)
|
| 57 |
# os.makedirs(CACHE_DIR, exist_ok=True)
|
|
|
|
| 58 |
# except Exception as e:
|
| 59 |
# print(f"[Cache cleanup error] {e}")
|
| 60 |
|
| 61 |
|
| 62 |
+
# # ---------------- PDF extraction with page tracking ----------------
|
| 63 |
+
# def extract_pages_from_pdf(file_bytes: bytes) -> List[Tuple[int, str]]:
|
| 64 |
+
# """Returns list of (page_number, page_text)"""
|
| 65 |
# try:
|
| 66 |
# doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 67 |
+
# pages = []
|
| 68 |
+
# for i, page in enumerate(doc):
|
| 69 |
+
# text = page.get_text().strip()
|
| 70 |
+
# if text:
|
| 71 |
+
# pages.append((i + 1, text))
|
| 72 |
+
# return pages
|
| 73 |
# except Exception as e:
|
| 74 |
+
# return [(0, f"[PDF extraction error] {e}")]
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# # ---------------- Chunking strategy ----------------
|
| 78 |
+
# def chunk_text(text: str, source: str, page: int,
|
| 79 |
+
# chunk_size: int = CHUNK_SIZE,
|
| 80 |
+
# overlap: int = CHUNK_OVERLAP) -> List[Tuple[str, str, int]]:
|
| 81 |
+
# """
|
| 82 |
+
# Splits text into overlapping word-level chunks.
|
| 83 |
+
# Returns list of (chunk_text, source, page)
|
| 84 |
+
# """
|
| 85 |
+
# words = text.split()
|
| 86 |
+
# chunks = []
|
| 87 |
+
# step = chunk_size - overlap
|
| 88 |
+
# for i in range(0, len(words), step):
|
| 89 |
+
# chunk = " ".join(words[i: i + chunk_size])
|
| 90 |
+
# if len(chunk.strip()) > 50:
|
| 91 |
+
# chunks.append((chunk, source, page))
|
| 92 |
+
# if i + chunk_size >= len(words):
|
| 93 |
+
# break
|
| 94 |
+
# return chunks
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# # ---------------- Cache helpers ----------------
|
| 98 |
# def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
|
| 99 |
# h = hashlib.sha256()
|
| 100 |
# for name, b in sorted(files, key=lambda x: x[0]):
|
| 101 |
# h.update(name.encode())
|
|
|
|
| 102 |
# h.update(hashlib.sha256(b).digest())
|
| 103 |
# return h.hexdigest()
|
| 104 |
|
| 105 |
+
# def cache_save(cache_key: str, embeddings: np.ndarray,
|
| 106 |
+
# chunks: List[str], sources: List[str], pages: List[int]):
|
| 107 |
+
# np.savez_compressed(
|
| 108 |
+
# os.path.join(CACHE_DIR, f"{cache_key}.npz"),
|
| 109 |
+
# embeddings=embeddings,
|
| 110 |
+
# chunks=np.array(chunks),
|
| 111 |
+
# sources=np.array(sources),
|
| 112 |
+
# pages=np.array(pages),
|
| 113 |
+
# )
|
| 114 |
|
| 115 |
# def cache_load(cache_key: str):
|
| 116 |
# path = os.path.join(CACHE_DIR, f"{cache_key}.npz")
|
|
|
|
| 118 |
# return None
|
| 119 |
# try:
|
| 120 |
# data = np.load(path, allow_pickle=True)
|
| 121 |
+
# return (
|
| 122 |
+
# data["embeddings"],
|
| 123 |
+
# data["chunks"].tolist(),
|
| 124 |
+
# data["sources"].tolist(),
|
| 125 |
+
# data["pages"].tolist(),
|
| 126 |
+
# )
|
| 127 |
# except:
|
| 128 |
# return None
|
| 129 |
|
| 130 |
+
|
| 131 |
+
# # ---------------- FAISS ----------------
|
| 132 |
# def build_faiss(emb: np.ndarray):
|
| 133 |
# global FAISS_INDEX
|
| 134 |
# if emb is None or len(emb) == 0:
|
| 135 |
# FAISS_INDEX = None
|
| 136 |
+
# return
|
| 137 |
# emb = emb.astype("float32")
|
| 138 |
# index = faiss.IndexFlatL2(emb.shape[1])
|
| 139 |
# index.add(emb)
|
| 140 |
# FAISS_INDEX = index
|
|
|
|
| 141 |
|
| 142 |
+
# def search(query: str, k: int = TOP_K):
|
| 143 |
+
# if FAISS_INDEX is None or not CHUNKS:
|
| 144 |
# return []
|
| 145 |
# q_emb = get_embedder().encode([query], convert_to_numpy=True).astype("float32")
|
| 146 |
# D, I = FAISS_INDEX.search(q_emb, k)
|
| 147 |
+
# results = []
|
| 148 |
+
# for d, i in zip(D[0], I[0]):
|
| 149 |
+
# if i >= 0 and i < len(CHUNKS):
|
| 150 |
+
# results.append({
|
| 151 |
+
# "text": CHUNKS[i],
|
| 152 |
+
# "source": CHUNK_SOURCES[i],
|
| 153 |
+
# "page": CHUNK_PAGES[i],
|
| 154 |
+
# "distance": float(d),
|
| 155 |
+
# })
|
| 156 |
+
# return results
|
| 157 |
|
| 158 |
|
| 159 |
# # ---------------- OpenRouter API ----------------
|
| 160 |
+
# def call_openrouter(messages: list) -> str:
|
| 161 |
# if not OPENROUTER_API_KEY:
|
| 162 |
+
# return "Error: OPENROUTER_API_KEY is not set. Please add it in HF Space secrets."
|
| 163 |
|
| 164 |
# url = "https://openrouter.ai/api/v1/chat/completions"
|
| 165 |
# headers = {
|
| 166 |
# "Authorization": f"Bearer {OPENROUTER_API_KEY}",
|
| 167 |
# "Content-Type": "application/json",
|
| 168 |
# }
|
|
|
|
| 169 |
# payload = {
|
| 170 |
# "model": OPENROUTER_MODEL,
|
| 171 |
+
# "messages": [{"role": "system", "content": SYSTEM_PROMPT}] + messages,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
# }
|
| 173 |
|
| 174 |
# try:
|
| 175 |
# r = requests.post(url, headers=headers, json=payload, timeout=60)
|
| 176 |
# r.raise_for_status()
|
| 177 |
# obj = r.json()
|
|
|
|
| 178 |
# if "choices" in obj and obj["choices"]:
|
| 179 |
+
# return obj["choices"][0]["message"]["content"].strip().replace("```", "")
|
| 180 |
+
# return "[Unexpected response from API]"
|
|
|
|
| 181 |
# except Exception as e:
|
| 182 |
+
# return f"[OpenRouter error] {e}"
|
| 183 |
|
| 184 |
|
| 185 |
+
# # ---------------- File bytes reader ----------------
|
| 186 |
# def read_file_bytes(f) -> Tuple[str, bytes]:
|
|
|
|
| 187 |
# if isinstance(f, tuple) and len(f) == 2 and isinstance(f[1], (bytes, bytearray)):
|
| 188 |
# return f[0], bytes(f[1])
|
|
|
|
|
|
|
| 189 |
# if isinstance(f, dict):
|
| 190 |
# name = f.get("name") or f.get("filename") or "uploaded"
|
| 191 |
# data = f.get("data") or f.get("content") or f.get("value") or f.get("file")
|
|
|
|
| 200 |
# if tmp_path and isinstance(tmp_path, str) and os.path.exists(tmp_path):
|
| 201 |
# with open(tmp_path, "rb") as fh:
|
| 202 |
# return os.path.basename(tmp_path), fh.read()
|
|
|
|
|
|
|
| 203 |
# if hasattr(f, "name") and hasattr(f, "read"):
|
| 204 |
# try:
|
| 205 |
# name = os.path.basename(f.name) if getattr(f, "name", None) else "uploaded"
|
| 206 |
# return name, f.read()
|
| 207 |
# except Exception:
|
| 208 |
# pass
|
|
|
|
|
|
|
| 209 |
# if hasattr(f, "name") and hasattr(f, "value"):
|
| 210 |
# name = os.path.basename(getattr(f, "name") or "uploaded")
|
| 211 |
# v = getattr(f, "value")
|
|
|
|
| 213 |
# return name, bytes(v)
|
| 214 |
# if isinstance(v, str):
|
| 215 |
# return name, v.encode("utf-8")
|
|
|
|
|
|
|
| 216 |
# if isinstance(f, str) and os.path.exists(f):
|
| 217 |
# with open(f, "rb") as fh:
|
| 218 |
# return os.path.basename(f), fh.read()
|
|
|
|
| 219 |
# raise ValueError(f"Unsupported file object type: {type(f)}")
|
| 220 |
|
| 221 |
|
| 222 |
+
# # ---------------- Upload & Index ----------------
|
| 223 |
# def upload_and_index(files):
|
| 224 |
+
# global CHUNKS, CHUNK_SOURCES, CHUNK_PAGES, EMBEDDINGS, INDEXED_FILES
|
| 225 |
|
| 226 |
# if not files:
|
| 227 |
+
# return "No files uploaded.", "No files indexed yet."
|
| 228 |
|
|
|
|
| 229 |
# clear_old_cache()
|
| 230 |
|
| 231 |
# processed = []
|
|
|
|
| 237 |
# name, b = read_file_bytes(f)
|
| 238 |
# processed.append((name, b))
|
| 239 |
# except ValueError as e:
|
| 240 |
+
# return f"Upload error: {e}", "No files indexed yet."
|
|
|
|
|
|
|
| 241 |
|
| 242 |
# cache_key = make_cache_key(processed)
|
|
|
|
|
|
|
| 243 |
# cached = cache_load(cache_key)
|
| 244 |
+
|
| 245 |
# if cached:
|
| 246 |
+
# EMBEDDINGS, CHUNKS, CHUNK_SOURCES, CHUNK_PAGES = cached
|
| 247 |
# EMBEDDINGS = np.array(EMBEDDINGS)
|
|
|
|
| 248 |
# build_faiss(EMBEDDINGS)
|
| 249 |
+
# INDEXED_FILES = [{"name": n, "size_kb": round(len(b)/1024, 1)} for n, b in processed]
|
| 250 |
+
# return (
|
| 251 |
+
# f"Loaded from cache β {len(CHUNKS)} chunks across {len(processed)} PDF(s).",
|
| 252 |
+
# _render_file_list(INDEXED_FILES)
|
| 253 |
+
# )
|
| 254 |
+
|
| 255 |
+
# all_chunks, all_sources, all_pages = [], [], []
|
| 256 |
+
# INDEXED_FILES = []
|
| 257 |
+
|
| 258 |
+
# for name, b in processed:
|
| 259 |
+
# pages = extract_pages_from_pdf(b)
|
| 260 |
+
# file_chunks = 0
|
| 261 |
+
# for page_num, page_text in pages:
|
| 262 |
+
# for chunk, src, pg in chunk_text(page_text, name, page_num):
|
| 263 |
+
# all_chunks.append(chunk)
|
| 264 |
+
# all_sources.append(src)
|
| 265 |
+
# all_pages.append(pg)
|
| 266 |
+
# file_chunks += 1
|
| 267 |
+
# INDEXED_FILES.append({
|
| 268 |
+
# "name": name,
|
| 269 |
+
# "size_kb": round(len(b) / 1024, 1),
|
| 270 |
+
# "pages": len(pages),
|
| 271 |
+
# "chunks": file_chunks,
|
| 272 |
+
# })
|
| 273 |
+
|
| 274 |
+
# CHUNKS = all_chunks
|
| 275 |
+
# CHUNK_SOURCES = all_sources
|
| 276 |
+
# CHUNK_PAGES = all_pages
|
| 277 |
+
|
| 278 |
+
# if not CHUNKS:
|
| 279 |
+
# return "Could not extract any text from the PDFs.", "No files indexed."
|
| 280 |
+
|
| 281 |
+
# EMBEDDINGS = get_embedder().encode(CHUNKS, convert_to_numpy=True).astype("float32")
|
| 282 |
+
# cache_save(cache_key, EMBEDDINGS, CHUNKS, CHUNK_SOURCES, CHUNK_PAGES)
|
| 283 |
# build_faiss(EMBEDDINGS)
|
| 284 |
|
| 285 |
+
# return (
|
| 286 |
+
# f"Indexed {len(processed)} PDF(s) β {len(CHUNKS)} chunks ready.",
|
| 287 |
+
# _render_file_list(INDEXED_FILES)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
# )
|
| 289 |
|
| 290 |
+
# def _render_file_list(files: List[dict]) -> str:
|
| 291 |
+
# if not files:
|
| 292 |
+
# return "No files indexed yet."
|
| 293 |
+
# lines = []
|
| 294 |
+
# for f in files:
|
| 295 |
+
# parts = [f"π {f['name']} ({f['size_kb']} KB)"]
|
| 296 |
+
# if "pages" in f:
|
| 297 |
+
# parts.append(f"{f['pages']} pages")
|
| 298 |
+
# if "chunks" in f:
|
| 299 |
+
# parts.append(f"{f['chunks']} chunks")
|
| 300 |
+
# lines.append(" | ".join(parts))
|
| 301 |
+
# return "\n".join(lines)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# # ---------------- Chat ----------------
|
| 305 |
+
# def chat(message: str, history: list):
|
| 306 |
+
# if not message.strip():
|
| 307 |
+
# return "", history
|
| 308 |
+
|
| 309 |
+
# if not CHUNKS:
|
| 310 |
+
# history.append((message, "No PDFs indexed yet. Please upload a PDF first."))
|
| 311 |
+
# return "", history
|
| 312 |
+
|
| 313 |
+
# results = search(message)
|
| 314 |
+
# if not results:
|
| 315 |
+
# history.append((message, "No relevant content found in the uploaded PDFs."))
|
| 316 |
+
# return "", history
|
| 317 |
+
|
| 318 |
+
# context_parts = []
|
| 319 |
+
# sources_used = []
|
| 320 |
+
# for r in results:
|
| 321 |
+
# context_parts.append(f"[From: {r['source']}, Page {r['page']}]\n{r['text']}")
|
| 322 |
+
# source_ref = f"{r['source']} (p.{r['page']})"
|
| 323 |
+
# if source_ref not in sources_used:
|
| 324 |
+
# sources_used.append(source_ref)
|
| 325 |
+
|
| 326 |
+
# context = "\n\n---\n\n".join(context_parts)
|
| 327 |
+
|
| 328 |
+
# # Multi-turn: include last 4 exchanges
|
| 329 |
+
# messages = []
|
| 330 |
+
# for user_msg, bot_msg in history[-4:]:
|
| 331 |
+
# messages.append({"role": "user", "content": user_msg})
|
| 332 |
+
# messages.append({"role": "assistant", "content": bot_msg})
|
| 333 |
+
|
| 334 |
+
# messages.append({
|
| 335 |
+
# "role": "user",
|
| 336 |
+
# "content": f"Context from PDFs:\n\n{context}\n\nQuestion: {message}"
|
| 337 |
+
# })
|
| 338 |
+
|
| 339 |
+
# answer = call_openrouter(messages)
|
| 340 |
+
|
| 341 |
+
# if sources_used:
|
| 342 |
+
# answer += f"\n\nSources: {', '.join(sources_used)}"
|
| 343 |
+
|
| 344 |
+
# history.append((message, answer))
|
| 345 |
+
# return "", history
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# def clear_chat():
|
| 349 |
+
# return []
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
# # ---------------- Custom CSS ----------------
|
| 353 |
+
# custom_css = """
|
| 354 |
+
# @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=DM+Mono:wght@300;400;500&display=swap');
|
| 355 |
+
|
| 356 |
+
# :root {
|
| 357 |
+
# --bg: #0d0f12;
|
| 358 |
+
# --surface: #13161b;
|
| 359 |
+
# --surface2: #1a1e26;
|
| 360 |
+
# --border: #252a35;
|
| 361 |
+
# --accent: #4fffb0;
|
| 362 |
+
# --accent2: #00c2ff;
|
| 363 |
+
# --text: #e8eaf0;
|
| 364 |
+
# --muted: #6b7280;
|
| 365 |
+
# }
|
| 366 |
+
|
| 367 |
+
# body, .gradio-container {
|
| 368 |
+
# background: var(--bg) !important;
|
| 369 |
+
# font-family: 'DM Mono', monospace !important;
|
| 370 |
+
# color: var(--text) !important;
|
| 371 |
+
# }
|
| 372 |
+
|
| 373 |
+
# .gradio-container {
|
| 374 |
+
# max-width: 1100px !important;
|
| 375 |
+
# margin: 0 auto !important;
|
| 376 |
+
# }
|
| 377 |
+
|
| 378 |
+
# .app-header {
|
| 379 |
+
# text-align: center;
|
| 380 |
+
# padding: 36px 0 28px;
|
| 381 |
+
# border-bottom: 1px solid var(--border);
|
| 382 |
+
# margin-bottom: 28px;
|
| 383 |
+
# }
|
| 384 |
+
|
| 385 |
+
# .app-header h1 {
|
| 386 |
+
# font-family: 'Syne', sans-serif;
|
| 387 |
+
# font-size: 2.4rem;
|
| 388 |
+
# font-weight: 800;
|
| 389 |
+
# background: linear-gradient(135deg, var(--accent), var(--accent2));
|
| 390 |
+
# -webkit-background-clip: text;
|
| 391 |
+
# -webkit-text-fill-color: transparent;
|
| 392 |
+
# background-clip: text;
|
| 393 |
+
# margin: 0 0 6px;
|
| 394 |
+
# letter-spacing: -1px;
|
| 395 |
+
# }
|
| 396 |
+
|
| 397 |
+
# .app-header p {
|
| 398 |
+
# color: var(--muted);
|
| 399 |
+
# font-size: 0.85rem;
|
| 400 |
+
# margin: 0;
|
| 401 |
+
# font-family: 'DM Mono', monospace;
|
| 402 |
+
# }
|
| 403 |
+
|
| 404 |
+
# .section-label {
|
| 405 |
+
# font-family: 'Syne', sans-serif;
|
| 406 |
+
# font-size: 0.7rem;
|
| 407 |
+
# font-weight: 700;
|
| 408 |
+
# letter-spacing: 2.5px;
|
| 409 |
+
# text-transform: uppercase;
|
| 410 |
+
# color: var(--accent);
|
| 411 |
+
# margin-bottom: 10px;
|
| 412 |
+
# }
|
| 413 |
+
|
| 414 |
+
# textarea, input[type="text"] {
|
| 415 |
+
# background: var(--surface2) !important;
|
| 416 |
+
# border: 1px solid var(--border) !important;
|
| 417 |
+
# border-radius: 8px !important;
|
| 418 |
+
# color: var(--text) !important;
|
| 419 |
+
# font-family: 'DM Mono', monospace !important;
|
| 420 |
+
# font-size: 0.87rem !important;
|
| 421 |
+
# }
|
| 422 |
+
# textarea:focus, input[type="text"]:focus {
|
| 423 |
+
# border-color: var(--accent) !important;
|
| 424 |
+
# box-shadow: 0 0 0 2px rgba(79,255,176,0.08) !important;
|
| 425 |
+
# }
|
| 426 |
+
|
| 427 |
+
# .footer-note {
|
| 428 |
+
# text-align: center;
|
| 429 |
+
# margin-top: 28px;
|
| 430 |
+
# color: #2d3340;
|
| 431 |
+
# font-size: 0.72rem;
|
| 432 |
+
# font-family: 'DM Mono', monospace;
|
| 433 |
+
# letter-spacing: 0.5px;
|
| 434 |
+
# }
|
| 435 |
+
# """
|
| 436 |
|
| 437 |
|
| 438 |
# # ---------------- Gradio UI ----------------
|
| 439 |
+
# with gr.Blocks(
|
| 440 |
+
# title="PDF RAG Bot",
|
| 441 |
+
# css=custom_css,
|
| 442 |
+
# theme=gr.themes.Base(
|
| 443 |
+
# primary_hue="emerald",
|
| 444 |
+
# neutral_hue="slate",
|
| 445 |
+
# )
|
| 446 |
+
# ) as demo:
|
| 447 |
+
|
| 448 |
+
# gr.HTML("""
|
| 449 |
+
# <div class="app-header">
|
| 450 |
+
# <h1>β‘ PDF RAG Bot</h1>
|
| 451 |
+
# <p>Upload PDFs Β· Semantic chunking Β· Ask anything Β· AI answers with page sources</p>
|
| 452 |
+
# </div>
|
| 453 |
+
# """)
|
| 454 |
+
|
| 455 |
+
# with gr.Row(equal_height=False):
|
| 456 |
+
|
| 457 |
+
# # ββ Left: Upload panel ββ
|
| 458 |
+
# with gr.Column(scale=1, min_width=280):
|
| 459 |
+
# gr.HTML('<div class="section-label">π Document Upload</div>')
|
| 460 |
+
|
| 461 |
+
# file_input = gr.File(
|
| 462 |
+
# label="Drop PDF files here",
|
| 463 |
+
# file_count="multiple",
|
| 464 |
+
# file_types=[".pdf"],
|
| 465 |
+
# )
|
| 466 |
+
# upload_btn = gr.Button("β‘ Upload & Index", variant="primary", size="lg")
|
| 467 |
+
|
| 468 |
+
# status = gr.Textbox(
|
| 469 |
+
# label="Status",
|
| 470 |
+
# interactive=False,
|
| 471 |
+
# lines=2,
|
| 472 |
+
# )
|
| 473 |
+
# file_list = gr.Textbox(
|
| 474 |
+
# label="Indexed Files",
|
| 475 |
+
# interactive=False,
|
| 476 |
+
# lines=6,
|
| 477 |
+
# placeholder="No files indexed yet...",
|
| 478 |
+
# )
|
| 479 |
+
|
| 480 |
+
# # ββ Right: Chat panel ββ
|
| 481 |
+
# with gr.Column(scale=2):
|
| 482 |
+
# gr.HTML('<div class="section-label">π¬ Chat with your PDFs</div>')
|
| 483 |
+
|
| 484 |
+
# chatbot = gr.Chatbot(
|
| 485 |
+
# label="",
|
| 486 |
+
# height=430,
|
| 487 |
+
# bubble_full_width=False,
|
| 488 |
+
# show_label=False,
|
| 489 |
+
# placeholder="Upload a PDF and start asking questions...",
|
| 490 |
+
# )
|
| 491 |
+
|
| 492 |
+
# with gr.Row():
|
| 493 |
+
# question = gr.Textbox(
|
| 494 |
+
# label="",
|
| 495 |
+
# placeholder="Ask something about your documents...",
|
| 496 |
+
# lines=2,
|
| 497 |
+
# scale=5,
|
| 498 |
+
# show_label=False,
|
| 499 |
+
# )
|
| 500 |
+
# with gr.Column(scale=1, min_width=90):
|
| 501 |
+
# send_btn = gr.Button("Send β€", variant="primary")
|
| 502 |
+
# clear_btn = gr.Button("Clear", variant="secondary")
|
| 503 |
+
|
| 504 |
+
# gr.HTML("""
|
| 505 |
+
# <div class="footer-note">
|
| 506 |
+
# Powered by OpenRouter Β· nvidia/nemotron-nano-12b Β·
|
| 507 |
+
# sentence-transformers Β· FAISS vector search
|
| 508 |
+
# </div>
|
| 509 |
+
# """)
|
| 510 |
+
|
| 511 |
+
# # Events
|
| 512 |
+
# upload_btn.click(
|
| 513 |
+
# upload_and_index,
|
| 514 |
+
# inputs=[file_input],
|
| 515 |
+
# outputs=[status, file_list],
|
| 516 |
+
# )
|
| 517 |
+
# send_btn.click(
|
| 518 |
+
# chat,
|
| 519 |
+
# inputs=[question, chatbot],
|
| 520 |
+
# outputs=[question, chatbot],
|
| 521 |
+
# )
|
| 522 |
+
# question.submit(
|
| 523 |
+
# chat,
|
| 524 |
+
# inputs=[question, chatbot],
|
| 525 |
+
# outputs=[question, chatbot],
|
| 526 |
+
# )
|
| 527 |
+
# clear_btn.click(clear_chat, outputs=[chatbot])
|
| 528 |
|
|
|
|
| 529 |
|
| 530 |
# if __name__ == "__main__":
|
| 531 |
# demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)
|
| 532 |
|
| 533 |
import os
|
|
|
|
| 534 |
import hashlib
|
| 535 |
import shutil
|
| 536 |
from typing import List, Tuple
|
|
|
|
| 547 |
OPENROUTER_MODEL = "nvidia/nemotron-nano-12b-v2-vl:free"
|
| 548 |
EMBEDDING_MODEL_NAME = "paraphrase-MiniLM-L3-v2"
|
| 549 |
CACHE_DIR = "./cache"
|
| 550 |
+
CHUNK_SIZE = 300
|
| 551 |
+
CHUNK_OVERLAP = 50
|
| 552 |
+
TOP_K = 4
|
| 553 |
|
| 554 |
SYSTEM_PROMPT = (
|
| 555 |
"You are an expert document assistant. "
|
| 556 |
"Answer questions using ONLY the provided context from the uploaded PDFs. "
|
| 557 |
+
"Be concise, accurate, and mention which document your answer comes from. "
|
| 558 |
"Always respond in plain text. Avoid markdown formatting."
|
| 559 |
)
|
| 560 |
|
| 561 |
os.makedirs(CACHE_DIR, exist_ok=True)
|
| 562 |
|
|
|
|
| 563 |
embedder = None
|
| 564 |
|
| 565 |
def get_embedder():
    """Return the process-wide sentence-embedding model, creating it on first use.

    Lazy-loads the SentenceTransformer so app startup stays fast; subsequent
    calls reuse the cached instance.
    """
    global embedder
    if embedder is not None:
        return embedder
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
    return embedder
|
| 570 |
|
|
|
|
| 571 |
CHUNKS: List[str] = []
|
| 572 |
CHUNK_SOURCES: List[str] = []
|
| 573 |
CHUNK_PAGES: List[int] = []
|
|
|
|
| 576 |
INDEXED_FILES: List[dict] = []
|
| 577 |
|
| 578 |
|
|
|
|
| 579 |
def clear_old_cache():
    """Wipe the on-disk cache directory and recreate it empty.

    Any failure is logged and swallowed: a stale cache is an inconvenience,
    not a fatal error.
    """
    try:
        cache_present = os.path.exists(CACHE_DIR)
        if cache_present:
            shutil.rmtree(CACHE_DIR)
        os.makedirs(CACHE_DIR, exist_ok=True)
    except Exception as e:
        print(f"[Cache error] {e}")
|
| 586 |
|
| 587 |
|
|
|
|
| 588 |
def extract_pages_from_pdf(file_bytes: bytes) -> List[Tuple[int, str]]:
|
|
|
|
| 589 |
try:
|
| 590 |
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 591 |
pages = []
|
|
|
|
| 595 |
pages.append((i + 1, text))
|
| 596 |
return pages
|
| 597 |
except Exception as e:
|
| 598 |
+
return [(0, f"[PDF error] {e}")]
|
| 599 |
|
| 600 |
|
| 601 |
+
def chunk_text(
    text: str,
    source: str,
    page: int,
    chunk_size: int = None,
    overlap: int = None,
) -> List[Tuple[str, str, int]]:
    """Split *text* into overlapping word-window chunks tagged with provenance.

    Args:
        text: Raw page text to split.
        source: Originating file name, attached to every chunk.
        page: 1-based page number, attached to every chunk.
        chunk_size: Words per chunk; defaults to the module-level CHUNK_SIZE.
        overlap: Words shared between consecutive chunks; defaults to
            the module-level CHUNK_OVERLAP.

    Returns:
        List of (chunk_text, source, page) tuples. Fragments of 50 characters
        or fewer are dropped as too small to be useful retrieval units.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if overlap is None:
        overlap = CHUNK_OVERLAP
    # Guard against overlap >= chunk_size, which would make the stride
    # non-positive and raise in range().
    step = max(1, chunk_size - overlap)

    words = text.split()
    chunks: List[Tuple[str, str, int]] = []
    for i in range(0, len(words), step):
        piece = " ".join(words[i: i + chunk_size])
        # Skip fragments too small to carry meaning for retrieval.
        if len(piece.strip()) > 50:
            chunks.append((piece, source, page))
        # Once a window reaches the end of the text, stop — further strides
        # would only produce shrinking suffixes of the same tail.
        if i + chunk_size >= len(words):
            break
    return chunks
|
| 612 |
|
| 613 |
|
|
|
|
| 614 |
def make_cache_key(files: List[Tuple[str, bytes]]) -> str:
|
| 615 |
h = hashlib.sha256()
|
| 616 |
for name, b in sorted(files, key=lambda x: x[0]):
|
|
|
|
| 618 |
h.update(hashlib.sha256(b).digest())
|
| 619 |
return h.hexdigest()
|
| 620 |
|
| 621 |
+
def cache_save(cache_key, embeddings, chunks, sources, pages):
|
|
|
|
| 622 |
np.savez_compressed(
|
| 623 |
os.path.join(CACHE_DIR, f"{cache_key}.npz"),
|
| 624 |
embeddings=embeddings,
|
|
|
|
| 627 |
pages=np.array(pages),
|
| 628 |
)
|
| 629 |
|
| 630 |
+
def cache_load(cache_key, cache_dir=None):
    """Load a cached embedding bundle previously written by cache_save.

    Args:
        cache_key: Hex digest identifying the indexed file set.
        cache_dir: Directory to read from; defaults to the module-level
            CACHE_DIR (parameterized for testability).

    Returns:
        Tuple (embeddings, chunks, sources, pages) — the embeddings as a
        numpy array, the rest as plain Python lists — or None when the
        cache entry is missing or unreadable.
    """
    base = CACHE_DIR if cache_dir is None else cache_dir
    path = os.path.join(base, f"{cache_key}.npz")
    if not os.path.exists(path):
        return None
    try:
        # allow_pickle is needed because chunks/sources are object arrays.
        data = np.load(path, allow_pickle=True)
        return (
            data["embeddings"],
            data["chunks"].tolist(),
            data["sources"].tolist(),
            data["pages"].tolist(),
        )
    except Exception as e:  # corrupt/truncated archive, missing keys, etc.
        print(f"[Cache load error] {e}")
        return None
|
| 639 |
|
| 640 |
|
| 641 |
+
def build_faiss(emb):
|
|
|
|
| 642 |
global FAISS_INDEX
|
| 643 |
if emb is None or len(emb) == 0:
|
| 644 |
FAISS_INDEX = None
|
|
|
|
| 648 |
index.add(emb)
|
| 649 |
FAISS_INDEX = index
|
| 650 |
|
| 651 |
+
def search(query: str):
    """Return the TOP_K indexed chunks nearest to *query* via FAISS.

    Each hit is a dict with keys "text", "source", "page", "distance"
    (L2 distance, smaller is closer). Returns [] when nothing is indexed.
    """
    if FAISS_INDEX is None or not CHUNKS:
        return []
    query_vec = get_embedder().encode([query], convert_to_numpy=True).astype("float32")
    distances, indices = FAISS_INDEX.search(query_vec, TOP_K)
    n = len(CHUNKS)
    # FAISS pads with -1 when fewer than TOP_K vectors exist; filter those out.
    return [
        {
            "text": CHUNKS[idx],
            "source": CHUNK_SOURCES[idx],
            "page": CHUNK_PAGES[idx],
            "distance": float(dist),
        }
        for dist, idx in zip(distances[0], indices[0])
        if 0 <= idx < n
    ]
|
| 661 |
|
| 662 |
|
|
|
|
| 663 |
def call_openrouter(messages: list) -> str:
    """POST a chat-completion request to OpenRouter and return the reply text.

    The module-level SYSTEM_PROMPT is prepended as the system message.
    All failures (missing key, HTTP errors, malformed responses) are
    reported as strings rather than raised, so the chat UI never crashes.
    """
    if not OPENROUTER_API_KEY:
        return "Error: OPENROUTER_API_KEY not set. Add it in HF Space secrets."

    endpoint = "https://openrouter.ai/api/v1/chat/completions"
    request_headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
    }
    body = {
        "model": OPENROUTER_MODEL,
        "messages": [{"role": "system", "content": SYSTEM_PROMPT}] + messages,
    }

    try:
        resp = requests.post(endpoint, headers=request_headers, json=body, timeout=60)
        resp.raise_for_status()
        parsed = resp.json()
        if "choices" in parsed and parsed["choices"]:
            # Strip stray code fences — the prompt asks for plain text.
            return parsed["choices"][0]["message"]["content"].strip().replace("```", "")
        return "[Unexpected API response]"
    except Exception as e:
        return f"[OpenRouter error] {e}"
|
| 678 |
|
| 679 |
|
|
|
|
| 680 |
def read_file_bytes(f) -> Tuple[str, bytes]:
|
| 681 |
if isinstance(f, tuple) and len(f) == 2 and isinstance(f[1], (bytes, bytearray)):
|
| 682 |
return f[0], bytes(f[1])
|
|
|
|
| 686 |
if isinstance(data, (bytes, bytearray)):
|
| 687 |
return name, bytes(data)
|
| 688 |
if isinstance(data, str):
|
| 689 |
+
try: return name, data.encode("utf-8")
|
| 690 |
+
except: pass
|
| 691 |
+
tmp = f.get("tmp_path") or f.get("path") or f.get("file")
|
| 692 |
+
if tmp and isinstance(tmp, str) and os.path.exists(tmp):
|
| 693 |
+
with open(tmp, "rb") as fh:
|
| 694 |
+
return os.path.basename(tmp), fh.read()
|
|
|
|
|
|
|
| 695 |
if hasattr(f, "name") and hasattr(f, "read"):
|
| 696 |
try:
|
| 697 |
name = os.path.basename(f.name) if getattr(f, "name", None) else "uploaded"
|
| 698 |
return name, f.read()
|
| 699 |
+
except: pass
|
|
|
|
| 700 |
if hasattr(f, "name") and hasattr(f, "value"):
|
| 701 |
name = os.path.basename(getattr(f, "name") or "uploaded")
|
| 702 |
v = getattr(f, "value")
|
| 703 |
+
if isinstance(v, (bytes, bytearray)): return name, bytes(v)
|
| 704 |
+
if isinstance(v, str): return name, v.encode("utf-8")
|
|
|
|
|
|
|
| 705 |
if isinstance(f, str) and os.path.exists(f):
|
| 706 |
with open(f, "rb") as fh:
|
| 707 |
return os.path.basename(f), fh.read()
|
| 708 |
+
raise ValueError(f"Unsupported file type: {type(f)}")
|
| 709 |
|
| 710 |
|
|
|
|
| 711 |
def upload_and_index(files):
|
| 712 |
global CHUNKS, CHUNK_SOURCES, CHUNK_PAGES, EMBEDDINGS, INDEXED_FILES
|
|
|
|
| 713 |
if not files:
|
| 714 |
+
return _status_html("warning", "No files selected. Please upload at least one PDF."), _file_cards([])
|
| 715 |
|
| 716 |
clear_old_cache()
|
|
|
|
| 717 |
processed = []
|
| 718 |
if not isinstance(files, (list, tuple)):
|
| 719 |
files = [files]
|
|
|
|
| 723 |
name, b = read_file_bytes(f)
|
| 724 |
processed.append((name, b))
|
| 725 |
except ValueError as e:
|
| 726 |
+
return _status_html("error", f"Upload error: {e}"), _file_cards([])
|
| 727 |
|
| 728 |
cache_key = make_cache_key(processed)
|
| 729 |
cached = cache_load(cache_key)
|
|
|
|
| 734 |
build_faiss(EMBEDDINGS)
|
| 735 |
INDEXED_FILES = [{"name": n, "size_kb": round(len(b)/1024, 1)} for n, b in processed]
|
| 736 |
return (
|
| 737 |
+
_status_html("success", f"Loaded from cache β {len(CHUNKS)} chunks across {len(processed)} PDF(s). Ready to chat!"),
|
| 738 |
+
_file_cards(INDEXED_FILES)
|
| 739 |
)
|
| 740 |
|
| 741 |
all_chunks, all_sources, all_pages = [], [], []
|
|
|
|
| 750 |
all_sources.append(src)
|
| 751 |
all_pages.append(pg)
|
| 752 |
file_chunks += 1
|
| 753 |
+
INDEXED_FILES.append({"name": name, "size_kb": round(len(b)/1024, 1), "pages": len(pages), "chunks": file_chunks})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
|
| 755 |
CHUNKS = all_chunks
|
| 756 |
CHUNK_SOURCES = all_sources
|
| 757 |
CHUNK_PAGES = all_pages
|
| 758 |
|
| 759 |
if not CHUNKS:
|
| 760 |
+
return _status_html("error", "Could not extract text from the uploaded PDFs."), _file_cards([])
|
| 761 |
|
| 762 |
EMBEDDINGS = get_embedder().encode(CHUNKS, convert_to_numpy=True).astype("float32")
|
| 763 |
cache_save(cache_key, EMBEDDINGS, CHUNKS, CHUNK_SOURCES, CHUNK_PAGES)
|
| 764 |
build_faiss(EMBEDDINGS)
|
| 765 |
|
| 766 |
return (
|
| 767 |
+
_status_html("success", f"Successfully indexed {len(processed)} PDF(s) β {len(CHUNKS)} semantic chunks ready!"),
|
| 768 |
+
_file_cards(INDEXED_FILES)
|
| 769 |
)
|
| 770 |
|
| 771 |
+
|
| 772 |
+
def _status_html(kind: str, msg: str) -> str:
|
| 773 |
+
colors = {
|
| 774 |
+
"success": ("#1a3a2a", "#4ade80", "β¦"),
|
| 775 |
+
"warning": ("#3a2e1a", "#fbbf24", "β "),
|
| 776 |
+
"error": ("#3a1a1a", "#f87171", "β"),
|
| 777 |
+
}
|
| 778 |
+
bg, color, icon = colors.get(kind, colors["success"])
|
| 779 |
+
return f"""<div style="background:{bg};border:1px solid {color}33;border-radius:10px;padding:12px 16px;font-family:'Courier Prime',monospace;font-size:0.85rem;color:{color};display:flex;align-items:center;gap:10px;"><span style="font-size:1rem">{icon}</span>{msg}</div>"""
|
| 780 |
+
|
| 781 |
+
|
| 782 |
+
def _file_cards(files: List[dict]) -> str:
    """Render the indexed-file list as styled HTML cards.

    Each entry in *files* is a dict with at least "name"; "size_kb",
    "pages" and "chunks" are shown when present and fall back to "?".
    Returns a placeholder message when nothing has been indexed.
    """
    if not files:
        # Empty state shown before any PDF has been uploaded/indexed.
        return """<div style="font-family:'Courier Prime',monospace;color:#6b7280;font-size:0.8rem;text-align:center;padding:20px 0;">No files indexed yet</div>"""
    cards = []
    for f in files:
        name = f["name"]
        # Cache-hit entries carry only name/size; freshly indexed ones also
        # have page and chunk counts — missing stats render as "?".
        size = f.get("size_kb", "?")
        pages = f.get("pages", "?")
        chunks = f.get("chunks", "?")
        cards.append(f"""
        <div style="background:#1c1a16;border:1px solid #3a3020;border-radius:10px;padding:12px 14px;margin-bottom:8px;transition:border-color 0.2s;">
            <div style="font-family:'Playfair Display',serif;font-size:0.9rem;color:#f5e6c8;font-weight:600;margin-bottom:6px;white-space:nowrap;overflow:hidden;text-overflow:ellipsis;">π {name}</div>
            <div style="display:flex;gap:8px;flex-wrap:wrap;">
                <span style="background:#2a2218;border:1px solid #4a3a20;border-radius:20px;padding:2px 10px;font-size:0.72rem;color:#c9a85c;font-family:'Courier Prime',monospace;">{size} KB</span>
                <span style="background:#2a2218;border:1px solid #4a3a20;border-radius:20px;padding:2px 10px;font-size:0.72rem;color:#c9a85c;font-family:'Courier Prime',monospace;">{pages} pages</span>
                <span style="background:#2a2218;border:1px solid #4a3a20;border-radius:20px;padding:2px 10px;font-size:0.72rem;color:#c9a85c;font-family:'Courier Prime',monospace;">{chunks} chunks</span>
            </div>
        </div>""")
    return "".join(cards)
|
| 801 |
|
| 802 |
|
|
|
|
| 803 |
def chat(message: str, history: list):
    """Answer *message* with RAG over the indexed PDF chunks.

    Retrieves the nearest chunks, builds a context-grounded prompt with up
    to the last four exchanges for continuity, calls the LLM, and appends
    the (question, answer) pair to *history*. Returns ("", history) so the
    Gradio textbox clears after each turn.
    """
    if not message.strip():
        return "", history
    if not CHUNKS:
        history.append((message, "Please upload and index a PDF first before asking questions."))
        return "", history

    hits = search(message)
    if not hits:
        history.append((message, "No relevant content found in the uploaded PDFs for that question."))
        return "", history

    context_parts = [
        f"[From: {hit['source']}, Page {hit['page']}]\n{hit['text']}" for hit in hits
    ]
    # dict.fromkeys deduplicates while preserving first-seen order.
    sources_used = list(
        dict.fromkeys(f"{hit['source']} (p.{hit['page']})" for hit in hits)
    )
    context = "\n\n---\n\n".join(context_parts)

    # Multi-turn: replay the last four exchanges before the new question.
    messages = []
    for past_user, past_bot in history[-4:]:
        messages.append({"role": "user", "content": past_user})
        messages.append({"role": "assistant", "content": past_bot})
    messages.append({"role": "user", "content": f"Context:\n\n{context}\n\nQuestion: {message}"})

    answer = call_openrouter(messages)
    if sources_used:
        answer += f"\n\nβ Sources: {', '.join(sources_used)}"
    history.append((message, answer))
    return "", history
|
|
|
|
| 839 |
return []
|
| 840 |
|
| 841 |
|
| 842 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
# CSS β Warm Ink editorial luxury theme
# βββββββββββββββββββββββββββββββββββββββββββββββ
# Custom stylesheet injected via gr.Blocks(css=CSS). Dark "ink & gold"
# palette defined once in :root variables; Playfair Display for headings,
# Courier Prime for body text. Selectors target both custom elem_classes
# set in the UI below (.masthead, .upload-zone, .btn-*, .chatbot-wrap,
# .question-input, .site-footer) and Gradio's own internal wrappers.
# NOTE(review): `footer.svelte-1ax1toq` pins a build-hashed Gradio class —
# presumably to hide the default footer; may break on Gradio upgrades.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Playfair+Display:ital,wght@0,400;0,700;0,900;1,400&family=Courier+Prime:wght@400;700&display=swap');

/* ββ Reset & base ββ */
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }

:root {
  --ink: #0e0c08;
  --paper: #16130d;
  --card: #1c1a14;
  --card2: #232018;
  --border: #332d1e;
  --gold: #c9a85c;
  --gold2: #e8c87a;
  --cream: #f5e6c8;
  --muted: #7a6d55;
  --red: #c0392b;
}

html, body { background: var(--ink) !important; }

.gradio-container {
  background: var(--ink) !important;
  max-width: 1160px !important;
  margin: 0 auto !important;
  padding: 0 !important;
  font-family: 'Courier Prime', monospace !important;
}

/* ββ Masthead ββ */
.masthead {
  position: relative;
  text-align: center;
  padding: 52px 24px 36px;
  border-bottom: 2px solid var(--border);
  overflow: hidden;
}
.masthead::before {
  content: '';
  position: absolute;
  inset: 0;
  background: radial-gradient(ellipse 70% 60% at 50% 0%, #3a2a0a22 0%, transparent 70%);
  pointer-events: none;
}
.masthead-kicker {
  font-family: 'Courier Prime', monospace;
  font-size: 0.65rem;
  letter-spacing: 5px;
  text-transform: uppercase;
  color: var(--gold);
  margin-bottom: 10px;
}
.masthead h1 {
  font-family: 'Playfair Display', serif !important;
  font-size: clamp(2.4rem, 5vw, 4rem) !important;
  font-weight: 900 !important;
  color: var(--cream) !important;
  line-height: 1.05 !important;
  letter-spacing: -1.5px !important;
  margin-bottom: 10px !important;
}
.masthead h1 em {
  font-style: italic;
  color: var(--gold2);
}
.masthead-sub {
  font-family: 'Courier Prime', monospace;
  font-size: 0.78rem;
  color: var(--muted);
  letter-spacing: 1px;
}
.masthead-rule {
  display: flex;
  align-items: center;
  gap: 12px;
  margin: 18px auto 0;
  max-width: 320px;
}
.masthead-rule hr {
  flex: 1;
  border: none;
  border-top: 1px solid var(--border);
}
.masthead-rule span {
  color: var(--gold);
  font-size: 0.65rem;
  letter-spacing: 3px;
  text-transform: uppercase;
  white-space: nowrap;
  font-family: 'Courier Prime', monospace;
}

/* ββ Layout columns ββ */
.layout-wrap {
  display: grid;
  grid-template-columns: 320px 1fr;
  gap: 0;
  min-height: 620px;
}
.left-col {
  border-right: 1px solid var(--border);
  padding: 28px 24px;
  background: var(--paper);
}
.right-col {
  padding: 28px 28px 20px;
  display: flex;
  flex-direction: column;
  background: var(--ink);
}

/* ββ Section labels ββ */
.sec-label {
  font-family: 'Courier Prime', monospace;
  font-size: 0.6rem;
  letter-spacing: 4px;
  text-transform: uppercase;
  color: var(--gold);
  border-bottom: 1px solid var(--border);
  padding-bottom: 8px;
  margin-bottom: 16px;
}

/* ββ File upload zone ββ */
.upload-zone .wrap {
  background: var(--card) !important;
  border: 2px dashed var(--border) !important;
  border-radius: 12px !important;
  transition: border-color 0.25s, background 0.25s !important;
  min-height: 120px !important;
}
.upload-zone .wrap:hover {
  border-color: var(--gold) !important;
  background: var(--card2) !important;
}
.upload-zone .wrap svg { color: var(--muted) !important; }
.upload-zone .wrap p, .upload-zone label {
  color: var(--muted) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.8rem !important;
}

/* ββ Buttons ββ */
.btn-primary {
  background: linear-gradient(135deg, #b8923a, #e8c87a) !important;
  color: #0e0c08 !important;
  font-family: 'Playfair Display', serif !important;
  font-weight: 700 !important;
  font-size: 0.88rem !important;
  letter-spacing: 0.5px !important;
  border: none !important;
  border-radius: 8px !important;
  padding: 11px 0 !important;
  width: 100% !important;
  cursor: pointer !important;
  transition: opacity 0.2s, transform 0.1s !important;
  box-shadow: 0 4px 20px #c9a85c22 !important;
}
.btn-primary:hover { opacity: 0.88 !important; transform: translateY(-1px) !important; }
.btn-primary:active { transform: translateY(0) !important; }

.btn-send {
  background: linear-gradient(135deg, #b8923a, #e8c87a) !important;
  color: #0e0c08 !important;
  font-family: 'Playfair Display', serif !important;
  font-weight: 700 !important;
  font-size: 0.85rem !important;
  border: none !important;
  border-radius: 8px !important;
  padding: 10px 22px !important;
  cursor: pointer !important;
  transition: opacity 0.2s !important;
  white-space: nowrap !important;
}
.btn-send:hover { opacity: 0.85 !important; }

.btn-clear {
  background: transparent !important;
  color: var(--muted) !important;
  border: 1px solid var(--border) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.78rem !important;
  border-radius: 8px !important;
  padding: 10px 16px !important;
  cursor: pointer !important;
  transition: border-color 0.2s, color 0.2s !important;
  white-space: nowrap !important;
}
.btn-clear:hover { border-color: var(--gold) !important; color: var(--gold) !important; }

/* ββ Chatbot ββ */
.chatbot-wrap .wrap {
  background: transparent !important;
  border: none !important;
}
.chatbot-wrap {
  flex: 1;
  margin-bottom: 16px;
}

/* User bubble */
.message-wrap .user, .message.user {
  background: var(--card2) !important;
  border: 1px solid var(--border) !important;
  border-radius: 14px 14px 4px 14px !important;
  color: var(--cream) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.87rem !important;
  padding: 12px 16px !important;
  max-width: 80% !important;
  margin-left: auto !important;
}

/* Bot bubble */
.message-wrap .bot, .message.bot {
  background: var(--card) !important;
  border: 1px solid #3a3020 !important;
  border-left: 3px solid var(--gold) !important;
  border-radius: 4px 14px 14px 14px !important;
  color: var(--cream) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.87rem !important;
  padding: 12px 16px !important;
  max-width: 88% !important;
  line-height: 1.6 !important;
}

/* Empty state placeholder */
.chatbot-wrap .placeholder {
  color: var(--muted) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.82rem !important;
}

/* ββ Question input ββ */
.question-input textarea {
  background: var(--card) !important;
  border: 1px solid var(--border) !important;
  border-radius: 10px !important;
  color: var(--cream) !important;
  font-family: 'Courier Prime', monospace !important;
  font-size: 0.88rem !important;
  padding: 12px 14px !important;
  resize: none !important;
  transition: border-color 0.2s !important;
  line-height: 1.5 !important;
}
.question-input textarea:focus {
  border-color: var(--gold) !important;
  outline: none !important;
  box-shadow: 0 0 0 3px #c9a85c15 !important;
}
.question-input textarea::placeholder { color: var(--muted) !important; }
.question-input label { display: none !important; }

/* ββ Scrollbars ββ */
::-webkit-scrollbar { width: 5px; height: 5px; }
::-webkit-scrollbar-track { background: var(--paper); }
::-webkit-scrollbar-thumb { background: var(--border); border-radius: 4px; }
::-webkit-scrollbar-thumb:hover { background: var(--gold); }

/* ββ Footer ββ */
.site-footer {
  text-align: center;
  padding: 16px;
  border-top: 1px solid var(--border);
  font-family: 'Courier Prime', monospace;
  font-size: 0.68rem;
  color: #3a3020;
  letter-spacing: 1.5px;
  text-transform: uppercase;
}

/* Gradio internals cleanup */
.gr-padded { padding: 0 !important; }
footer.svelte-1ax1toq { display: none !important; }
.hide-label label { display: none !important; }
"""
|
| 1123 |
|
| 1124 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1125 |
+
# Gradio UI
|
| 1126 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1127 |
with gr.Blocks(
|
| 1128 |
title="PDF RAG Bot",
|
| 1129 |
+
css=CSS,
|
| 1130 |
theme=gr.themes.Base(
|
| 1131 |
+
primary_hue="amber",
|
| 1132 |
+
neutral_hue="stone",
|
| 1133 |
+
),
|
| 1134 |
) as demo:
|
| 1135 |
|
| 1136 |
+
# ββ Masthead ββ
|
| 1137 |
gr.HTML("""
|
| 1138 |
+
<div class="masthead">
|
| 1139 |
+
<div class="masthead-kicker">Intelligent Document Analysis</div>
|
| 1140 |
+
<h1>The <em>PDF</em> Oracle</h1>
|
| 1141 |
+
<div class="masthead-sub">Upload Β· Index Β· Interrogate Β· Discover</div>
|
| 1142 |
+
<div class="masthead-rule">
|
| 1143 |
+
<hr/><span>est. 2025</span><hr/>
|
| 1144 |
+
</div>
|
| 1145 |
</div>
|
| 1146 |
""")
|
| 1147 |
|
| 1148 |
+
# ββ Two-column layout ββ
|
| 1149 |
+
with gr.Row(equal_height=True):
|
| 1150 |
|
| 1151 |
+
# LEFT β Upload panel
|
| 1152 |
+
with gr.Column(scale=0, min_width=300):
|
| 1153 |
+
gr.HTML('<div class="sec-label">Β§ I β Document Vault</div>')
|
| 1154 |
|
| 1155 |
file_input = gr.File(
|
| 1156 |
+
label="Drop PDFs here or click to browse",
|
| 1157 |
file_count="multiple",
|
| 1158 |
file_types=[".pdf"],
|
| 1159 |
+
elem_classes=["upload-zone"],
|
| 1160 |
+
)
|
| 1161 |
+
|
| 1162 |
+
upload_btn = gr.Button(
|
| 1163 |
+
"β¬ Index Documents",
|
| 1164 |
+
variant="primary",
|
| 1165 |
+
elem_classes=["btn-primary"],
|
| 1166 |
)
|
|
|
|
| 1167 |
|
| 1168 |
+
status_html = gr.HTML(
|
| 1169 |
+
value="""<div style="font-family:'Courier Prime',monospace;color:#5a4d35;font-size:0.78rem;text-align:center;padding:10px 0;letter-spacing:1px;">Awaiting documents...</div>"""
|
|
|
|
|
|
|
| 1170 |
)
|
| 1171 |
+
|
| 1172 |
+
gr.HTML('<div class="sec-label" style="margin-top:20px;">Β§ II β Indexed Files</div>')
|
| 1173 |
+
|
| 1174 |
+
file_cards_html = gr.HTML(
|
| 1175 |
+
value="""<div style="font-family:'Courier Prime',monospace;color:#5a4d35;font-size:0.78rem;text-align:center;padding:16px 0;">No files indexed yet</div>"""
|
| 1176 |
)
|
| 1177 |
|
| 1178 |
+
# RIGHT β Chat panel
|
| 1179 |
+
with gr.Column(scale=1):
|
| 1180 |
+
gr.HTML('<div class="sec-label">Β§ III β Inquiry Chamber</div>')
|
| 1181 |
|
| 1182 |
chatbot = gr.Chatbot(
|
| 1183 |
label="",
|
| 1184 |
+
height=420,
|
| 1185 |
bubble_full_width=False,
|
| 1186 |
show_label=False,
|
| 1187 |
+
elem_classes=["chatbot-wrap"],
|
| 1188 |
+
placeholder="β¦ Ask anything about your uploaded documents β¦",
|
| 1189 |
)
|
| 1190 |
|
| 1191 |
+
with gr.Row(equal_height=True):
|
| 1192 |
question = gr.Textbox(
|
| 1193 |
label="",
|
| 1194 |
+
placeholder="Pose your question to the oracle...",
|
| 1195 |
lines=2,
|
| 1196 |
scale=5,
|
| 1197 |
show_label=False,
|
| 1198 |
+
elem_classes=["question-input"],
|
| 1199 |
)
|
| 1200 |
+
with gr.Column(scale=0, min_width=120):
|
| 1201 |
+
send_btn = gr.Button("Ask β¦", variant="primary", elem_classes=["btn-send"])
|
| 1202 |
+
clear_btn = gr.Button("Clear", variant="secondary", elem_classes=["btn-clear"])
|
| 1203 |
|
| 1204 |
+
# ββ Footer ββ
|
| 1205 |
gr.HTML("""
|
| 1206 |
+
<div class="site-footer">
|
| 1207 |
+
Powered by OpenRouter Β· NVIDIA Nemotron Β·
|
| 1208 |
+
Sentence-Transformers Β· FAISS Β· PyMuPDF
|
| 1209 |
</div>
|
| 1210 |
""")
|
| 1211 |
|
| 1212 |
+
# ββ Events ββ
|
| 1213 |
upload_btn.click(
|
| 1214 |
upload_and_index,
|
| 1215 |
inputs=[file_input],
|
| 1216 |
+
outputs=[status_html, file_cards_html],
|
| 1217 |
)
|
| 1218 |
send_btn.click(
|
| 1219 |
chat,
|