Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +160 -11
src/streamlit_app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
|
|
@@ -32,6 +33,7 @@ from textwrap import dedent
|
|
| 32 |
APP_PY = dedent(r'''
|
| 33 |
import os
|
| 34 |
import time
|
|
|
|
| 35 |
import traceback
|
| 36 |
from typing import List
|
| 37 |
|
|
@@ -45,6 +47,7 @@ from langchain_community.vectorstores import Chroma
|
|
| 45 |
|
| 46 |
from huggingface_hub import hf_hub_download
|
| 47 |
from llama_cpp import Llama
|
|
|
|
| 48 |
|
| 49 |
# -----------------------------
|
| 50 |
# Config
|
|
@@ -98,22 +101,99 @@ QNA_TEMPLATE = """[SYSTEM]
|
|
| 98 |
"""
|
| 99 |
|
| 100 |
# -----------------------------
|
| 101 |
-
#
|
| 102 |
# -----------------------------
|
| 103 |
def list_pdfs(folder: str):
|
| 104 |
os.makedirs(folder, exist_ok=True)
|
| 105 |
return [os.path.join(folder, f) for f in os.listdir(folder) if f.lower().endswith(".pdf")]
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def build_or_load_vectorstore():
|
| 108 |
"""Load existing Chroma DB if present; else build from PDFs in data/."""
|
|
|
|
| 109 |
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
|
| 110 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 111 |
return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
|
| 112 |
|
| 113 |
pdfs = list_pdfs(DOCS_DIR)
|
| 114 |
if not pdfs:
|
| 115 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
|
|
|
| 116 |
|
|
|
|
| 117 |
docs = []
|
| 118 |
for p in pdfs:
|
| 119 |
loader = PyMuPDFLoader(p)
|
|
@@ -122,11 +202,18 @@ def build_or_load_vectorstore():
|
|
| 122 |
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 123 |
chunks = splitter.split_documents(docs)
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 126 |
vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
|
| 127 |
vs.persist()
|
| 128 |
return vs
|
| 129 |
|
|
|
|
|
|
|
|
|
|
| 130 |
def load_llm():
|
| 131 |
"""
|
| 132 |
Try to load primary (Mistral model). If it fails (OOM on CPU Space),
|
|
@@ -237,6 +324,7 @@ pymupdf==1.23.26
|
|
| 237 |
# Utils
|
| 238 |
numpy==1.26.4
|
| 239 |
pandas==2.1.4
|
|
|
|
| 240 |
''').strip() + "\n"
|
| 241 |
|
| 242 |
RUNTIME_TXT = "python-3.10\n"
|
|
@@ -244,16 +332,77 @@ RUNTIME_TXT = "python-3.10\n"
|
|
| 244 |
DATA_README = dedent(r'''
|
| 245 |
# Data folder
|
| 246 |
|
| 247 |
-
|
| 248 |
Place your NITDA PDFs here. Example filenames:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
-
python build_and_deploy_nitda_rag.py \
|
| 251 |
-
--space-id nwamgbowo/nitda-rag \
|
| 252 |
-
--pdf "/path/to/NITDA-ACT-2007-2019-Edition1.pdf" \
|
| 253 |
-
--pdf "/path/to/Digital-Literacy-Framework.pdf" \
|
| 254 |
-
--pdf "/path/to/FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf" \
|
| 255 |
-
--pdf "/path/to/NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf"
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
-
|
| 259 |
-
)
|
|
|
|
| 1 |
+
|
| 2 |
#!/usr/bin/env python3
|
| 3 |
# -*- coding: utf-8 -*-
|
| 4 |
|
|
|
|
| 33 |
APP_PY = dedent(r'''
|
| 34 |
import os
|
| 35 |
import time
|
| 36 |
+
import shutil
|
| 37 |
import traceback
|
| 38 |
from typing import List
|
| 39 |
|
|
|
|
| 47 |
|
| 48 |
from huggingface_hub import hf_hub_download
|
| 49 |
from llama_cpp import Llama
|
| 50 |
+
import requests
|
| 51 |
|
| 52 |
# -----------------------------
|
| 53 |
# Config
|
|
|
|
| 101 |
"""
|
| 102 |
|
| 103 |
# -----------------------------
|
| 104 |
+
# Auto-copy & seeding (STARTUP)
|
| 105 |
# -----------------------------
|
| 106 |
def list_pdfs(folder: str):
    """Return full paths of every *.pdf file in *folder*, creating the folder if absent."""
    os.makedirs(folder, exist_ok=True)
    pdf_paths = []
    for entry in os.listdir(folder):
        if entry.lower().endswith(".pdf"):
            pdf_paths.append(os.path.join(folder, entry))
    return pdf_paths
|
| 109 |
|
| 110 |
+
def seed_data_from_urls_if_empty():
    """Seed DOCS_DIR with PDFs downloaded from the SEED_PDF_URLS env var.

    SEED_PDF_URLS is a comma-separated list of URLs. Nothing is downloaded
    when DOCS_DIR already holds at least one PDF or when the variable is
    unset/empty. Returns the number of files successfully written.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)
    if any(name.lower().endswith(".pdf") for name in os.listdir(DOCS_DIR)):
        return 0

    raw_urls = os.getenv("SEED_PDF_URLS", "").strip()
    if not raw_urls:
        return 0

    downloaded = 0
    candidates = (u.strip() for u in raw_urls.split(","))
    for pdf_url in (u for u in candidates if u):
        try:
            # Derive a filename from the URL path, ignoring any query string.
            base_name = os.path.basename(pdf_url.split("?")[0]) or "document.pdf"
            target = os.path.join(DOCS_DIR, base_name)
            resp = requests.get(pdf_url, timeout=120)
            resp.raise_for_status()
            with open(target, "wb") as fh:
                fh.write(resp.content)
            downloaded += 1
            print(f"[seed] Downloaded: {target}")
        except Exception as exc:
            # Best-effort seeding: a bad URL must not abort app startup.
            print(f"[seed] Failed to download {pdf_url}: {exc}")
    return downloaded
|
| 138 |
+
|
| 139 |
+
def ensure_data_ready_and_reset_index_if_changed():
    """Prepare data/ at startup and invalidate the vector DB when its inputs changed.

    Steps:
    - Create DOCS_DIR if missing.
    - Copy any *.pdf from the repo root into DOCS_DIR (never overwrites existing files).
    - Seed DOCS_DIR from SEED_PDF_URLS when it is still empty.
    - If anything was added, delete DB_DIR so the Chroma index is rebuilt from fresh data.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)

    before = set(os.listdir(DOCS_DIR))
    copied = 0

    # Copy *.pdf from root into data/ (non-destructive: skips files already present).
    for fname in os.listdir("."):
        if fname.lower().endswith(".pdf"):
            src = os.path.join(".", fname)
            dst = os.path.join(DOCS_DIR, fname)
            if not os.path.exists(dst):
                try:
                    shutil.copy2(src, dst)
                    copied += 1
                    # NOTE(review): the arrow here was a mojibake byte in the pasted source; restored.
                    print(f"[init] Copied root PDF → {dst}")
                except Exception as e:
                    print(f"[init] Could not copy {src} to {dst}: {e}")

    seeded = seed_data_from_urls_if_empty()

    after = set(os.listdir(DOCS_DIR))
    changed = (copied > 0) or (seeded > 0) or (before != after)

    # A stale index over a changed corpus would silently serve wrong context, so drop it.
    if changed and os.path.isdir(DB_DIR):
        try:
            shutil.rmtree(DB_DIR)
            print(f"[init] Removed old vector DB at {DB_DIR}/ (changed data/: {copied} copied, {seeded} seeded)")
        except Exception as e:
            print(f"[init] Could not remove {DB_DIR}/: {e}")
|
| 175 |
+
|
| 176 |
+
# Call once on import (top-level)
|
| 177 |
+
ensure_data_ready_and_reset_index_if_changed()
|
| 178 |
+
|
| 179 |
+
# -----------------------------
|
| 180 |
+
# Vector store builder/loader
|
| 181 |
+
# -----------------------------
|
| 182 |
def build_or_load_vectorstore():
|
| 183 |
"""Load existing Chroma DB if present; else build from PDFs in data/."""
|
| 184 |
+
# Use persisted DB if present
|
| 185 |
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
|
| 186 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 187 |
return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
|
| 188 |
|
| 189 |
pdfs = list_pdfs(DOCS_DIR)
|
| 190 |
if not pdfs:
|
| 191 |
+
raise FileNotFoundError(
|
| 192 |
+
f"No PDFs found in '{DOCS_DIR}'. Upload PDFs to the 'data/' folder, "
|
| 193 |
+
f"use the auto-copy (place PDFs in repo root), or set SEED_PDF_URLS."
|
| 194 |
+
)
|
| 195 |
|
| 196 |
+
# Load and chunk
|
| 197 |
docs = []
|
| 198 |
for p in pdfs:
|
| 199 |
loader = PyMuPDFLoader(p)
|
|
|
|
| 202 |
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 203 |
chunks = splitter.split_documents(docs)
|
| 204 |
|
| 205 |
+
if not chunks:
|
| 206 |
+
raise ValueError("No text chunks were generated from the PDFs. Are the files readable?")
|
| 207 |
+
|
| 208 |
+
# Embed + persist
|
| 209 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 210 |
vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
|
| 211 |
vs.persist()
|
| 212 |
return vs
|
| 213 |
|
| 214 |
+
# -----------------------------
|
| 215 |
+
# LLM loader (with fallback)
|
| 216 |
+
# -----------------------------
|
| 217 |
def load_llm():
|
| 218 |
"""
|
| 219 |
Try to load primary (Mistral model). If it fails (OOM on CPU Space),
|
|
|
|
| 324 |
# Utils
|
| 325 |
numpy==1.26.4
|
| 326 |
pandas==2.1.4
|
| 327 |
+
requests==2.32.3
|
| 328 |
''').strip() + "\n"
|
| 329 |
|
| 330 |
RUNTIME_TXT = "python-3.10\n"
|
|
|
|
| 332 |
DATA_README = dedent(r'''
|
| 333 |
# Data folder
|
| 334 |
|
|
|
|
| 335 |
Place your NITDA PDFs here. Example filenames:
|
| 336 |
+
- NITDA-ACT-2007-2019-Edition1.pdf
|
| 337 |
+
- Digital-Literacy-Framework.pdf
|
| 338 |
+
- FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf
|
| 339 |
+
- NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf
|
| 340 |
+
''').strip() + "\n"
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
+
def write_project(project_dir: Path):
    """Materialize the Space project under *project_dir*.

    Writes app.py, requirements.txt, runtime.txt and data/README.md from the
    module-level template constants, then prints the created paths.
    """
    project_dir.mkdir(parents=True, exist_ok=True)
    (project_dir / "app.py").write_text(APP_PY, encoding="utf-8")
    (project_dir / "requirements.txt").write_text(REQUIREMENTS_TXT, encoding="utf-8")
    (project_dir / "runtime.txt").write_text(RUNTIME_TXT, encoding="utf-8")
    data_dir = project_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    (data_dir / "README.md").write_text(DATA_README, encoding="utf-8")
    # NOTE(review): the leading glyph was mojibake in the pasted source; restored to a check mark.
    print(f"✅ Wrote project to: {project_dir.resolve()}")
    for rel in ["app.py", "requirements.txt", "runtime.txt", "data/README.md"]:
        print(" -", project_dir / rel)
|
| 354 |
+
|
| 355 |
+
def deploy_to_space(project_dir: Path, space_id: str, private: bool = False):
    """Deploy the folder to a Hugging Face Space (SDK: Gradio). Requires HF_TOKEN env var.

    Raises RuntimeError when HF_TOKEN is unset; otherwise logs in, creates the
    Space if needed, and uploads *project_dir* as a single commit.
    """
    from huggingface_hub import HfApi, create_repo, login
    token = os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN not set. Create a token at https://huggingface.co/settings/tokens and `export HF_TOKEN=...`")
    login(token=token)
    # Creating an already-existing Space raises; that's fine — we just upload into it.
    try:
        create_repo(repo_id=space_id, repo_type="space", space_sdk="gradio", private=private)
        print(f"🚀 Created Space: {space_id}")
    except Exception as e:
        print(f"ℹ️ Space exists or cannot be created: {e}")
    api = HfApi()
    api.upload_folder(
        folder_path=str(project_dir),
        repo_id=space_id,
        repo_type="space",
        commit_message="Deploy NITDA RAG",
        ignore_patterns=[".git", "__pycache__", "*.ipynb_checkpoints*"],
    )
    # NOTE(review): leading emojis were mojibake in the pasted source; reconstructed — confirm originals.
    print(f"✅ Uploaded. Space: https://huggingface.co/spaces/{space_id}")
    print(f" App URL: https://{space_id.replace('/', '-')}.hf.space")
|
| 377 |
+
|
| 378 |
+
def main():
    """CLI entry point: scaffold the project locally and optionally deploy it to a Space."""
    parser = argparse.ArgumentParser(description="Create and optionally deploy a NITDA RAG app to Hugging Face Spaces.")
    parser.add_argument("--project", required=True, help="Local project directory to create (e.g., nitda-rag)")
    parser.add_argument("--space-id", help="Hugging Face Space ID (e.g., nwamgbowo/nitda-rag)")
    parser.add_argument("--deploy", action="store_true", help="Upload the project to the specified Space")
    parser.add_argument("--private", action="store_true", help="Create the Space as private (default: public)")
    args = parser.parse_args()

    project_dir = Path(args.project).resolve()
    write_project(project_dir)

    if args.deploy:
        if not args.space_id:
            # --deploy without a target Space is a usage error; exit code 2 mirrors argparse's convention.
            print("❌ --deploy requires --space-id (e.g., --space-id nwamgbowo/nitda-rag)")
            sys.exit(2)
        deploy_to_space(project_dir, args.space_id, private=args.private)
        # NOTE(review): the leading emojis below were mojibake in the pasted source; reconstructed — confirm originals.
        print("\n📌 After the Space is Running:")
        print(" 1) Upload PDFs to the data/ folder (or rely on auto-copy from root / URL seeding).")
        print(" 2) Click 'Initialize (build index + load model)'.")
        print(" 3) Ask questions.")
        print("\n💡 CPU Space tip: If Mistral fails to load, set Space Variable USE_TINYLLAMA=1 to force TinyLlama.\n")
    else:
        print("\n📌 To run locally:")
        print(f" cd {project_dir}")
        print(" pip install -r requirements.txt")
        print(" python app.py")
        print("\n📌 Then open http://localhost:7860 and click 'Initialize (build index + load model)'.")
        print("📂 Put your PDFs under the data/ folder (or in repo root; auto-copy will handle it).")


if __name__ == "__main__":
    main()
|