nwamgbowo commited on
Commit
8c26925
·
verified ·
1 Parent(s): 94f335b

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +160 -11
src/streamlit_app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
 
@@ -32,6 +33,7 @@ from textwrap import dedent
32
  APP_PY = dedent(r'''
33
  import os
34
  import time
 
35
  import traceback
36
  from typing import List
37
 
@@ -45,6 +47,7 @@ from langchain_community.vectorstores import Chroma
45
 
46
  from huggingface_hub import hf_hub_download
47
  from llama_cpp import Llama
 
48
 
49
  # -----------------------------
50
  # Config
@@ -98,22 +101,99 @@ QNA_TEMPLATE = """[SYSTEM]
98
  """
99
 
100
  # -----------------------------
101
- # Helpers
102
  # -----------------------------
103
def list_pdfs(folder: str):
    """Return the full paths of all PDF files in *folder*, creating it if needed."""
    os.makedirs(folder, exist_ok=True)
    pdf_paths = []
    for entry in os.listdir(folder):
        # Case-insensitive extension match; keep the entry's original casing.
        if entry.lower().endswith(".pdf"):
            pdf_paths.append(os.path.join(folder, entry))
    return pdf_paths
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  def build_or_load_vectorstore():
108
  """Load existing Chroma DB if present; else build from PDFs in data/."""
 
109
  if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
110
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
111
  return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
112
 
113
  pdfs = list_pdfs(DOCS_DIR)
114
  if not pdfs:
115
- raise FileNotFoundError(f"No PDFs found in '{DOCS_DIR}'. Upload your PDFs to the 'data/' folder.")
 
 
 
116
 
 
117
  docs = []
118
  for p in pdfs:
119
  loader = PyMuPDFLoader(p)
@@ -122,11 +202,18 @@ def build_or_load_vectorstore():
122
  splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
123
  chunks = splitter.split_documents(docs)
124
 
 
 
 
 
125
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
126
  vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
127
  vs.persist()
128
  return vs
129
 
 
 
 
130
  def load_llm():
131
  """
132
  Try to load primary (Mistral model). If it fails (OOM on CPU Space),
@@ -237,6 +324,7 @@ pymupdf==1.23.26
237
  # Utils
238
  numpy==1.26.4
239
  pandas==2.1.4
 
240
  ''').strip() + "\n"
241
 
242
  RUNTIME_TXT = "python-3.10\n"
@@ -244,16 +332,77 @@ RUNTIME_TXT = "python-3.10\n"
244
  DATA_README = dedent(r'''
245
  # Data folder
246
 
247
-
248
  Place your NITDA PDFs here. Example filenames:
 
 
 
 
 
249
 
250
- python build_and_deploy_nitda_rag.py \
251
- --space-id nwamgbowo/nitda-rag \
252
- --pdf "/path/to/NITDA-ACT-2007-2019-Edition1.pdf" \
253
- --pdf "/path/to/Digital-Literacy-Framework.pdf" \
254
- --pdf "/path/to/FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf" \
255
- --pdf "/path/to/NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf"
256
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
- ''').strip() + "\n"
259
- ))
 
1
+
2
  #!/usr/bin/env python3
3
  # -*- coding: utf-8 -*-
4
 
 
33
  APP_PY = dedent(r'''
34
  import os
35
  import time
36
+ import shutil
37
  import traceback
38
  from typing import List
39
 
 
47
 
48
  from huggingface_hub import hf_hub_download
49
  from llama_cpp import Llama
50
+ import requests
51
 
52
  # -----------------------------
53
  # Config
 
101
  """
102
 
103
  # -----------------------------
104
+ # Auto-copy & seeding (STARTUP)
105
  # -----------------------------
106
def list_pdfs(folder: str):
    """Ensure *folder* exists and list the paths of the PDF files inside it."""
    os.makedirs(folder, exist_ok=True)
    entries = os.listdir(folder)
    return [os.path.join(folder, name) for name in entries if name.lower().endswith(".pdf")]
109
 
110
def seed_data_from_urls_if_empty():
    """
    If data/ has no PDFs and SEED_PDF_URLS is set (comma-separated URLs),
    download those PDFs into data/. Returns the number of files downloaded.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)
    # Never overwrite an already-populated data/ folder.
    if any(name.lower().endswith(".pdf") for name in os.listdir(DOCS_DIR)):
        return 0

    raw = os.getenv("SEED_PDF_URLS", "").strip()
    if not raw:
        return 0

    downloaded = 0
    for url in (part.strip() for part in raw.split(",")):
        if not url:
            continue
        try:
            # Derive a filename from the URL path, ignoring any query string.
            fname = os.path.basename(url.split("?")[0]) or "document.pdf"
            dst = os.path.join(DOCS_DIR, fname)
            r = requests.get(url, timeout=120)
            r.raise_for_status()
            with open(dst, "wb") as f:
                f.write(r.content)
            downloaded += 1
            print(f"[seed] Downloaded: {dst}")
        except Exception as e:
            # Best effort: a failed URL is logged and skipped, not fatal.
            print(f"[seed] Failed to download {url}: {e}")
    return downloaded
138
+
139
def ensure_data_ready_and_reset_index_if_changed():
    """Prepare the data/ folder at startup and invalidate a stale vector index.

    Steps:
      - Create DOCS_DIR if missing.
      - Copy any ``*.pdf`` files from the repo root into DOCS_DIR (files
        already present there are left untouched).
      - If DOCS_DIR is still empty, optionally seed it from SEED_PDF_URLS.
      - If the contents of DOCS_DIR changed, delete DB_DIR so the vector
        store is rebuilt from the new documents on next initialization.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)

    before = set(os.listdir(DOCS_DIR))
    copied = 0

    # Copy *.pdf from root into data/ (best effort; failures are logged, not fatal).
    for fname in os.listdir("."):
        if fname.lower().endswith(".pdf"):
            src = os.path.join(".", fname)
            dst = os.path.join(DOCS_DIR, fname)
            if not os.path.exists(dst):
                try:
                    shutil.copy2(src, dst)
                    copied += 1
                    # NOTE: fixed mis-encoded arrow character in the log message.
                    print(f"[init] Copied root PDF → {dst}")
                except Exception as e:
                    print(f"[init] Could not copy {src} to {dst}: {e}")

    seeded = seed_data_from_urls_if_empty()

    after = set(os.listdir(DOCS_DIR))
    changed = (copied > 0) or (seeded > 0) or (before != after)

    # A stale persisted index would silently ignore the new PDFs, so drop it
    # and let the vector-store builder reindex from scratch.
    if changed and os.path.isdir(DB_DIR):
        try:
            shutil.rmtree(DB_DIR)
            print(f"[init] Removed old vector DB at {DB_DIR}/ (changed data/: {copied} copied, {seeded} seeded)")
        except Exception as e:
            print(f"[init] Could not remove {DB_DIR}/: {e}")

# Call once on import (top-level)
ensure_data_ready_and_reset_index_if_changed()
178
+
179
+ # -----------------------------
180
+ # Vector store builder/loader
181
+ # -----------------------------
182
  def build_or_load_vectorstore():
183
  """Load existing Chroma DB if present; else build from PDFs in data/."""
184
+ # Use persisted DB if present
185
  if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
186
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
187
  return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
188
 
189
  pdfs = list_pdfs(DOCS_DIR)
190
  if not pdfs:
191
+ raise FileNotFoundError(
192
+ f"No PDFs found in '{DOCS_DIR}'. Upload PDFs to the 'data/' folder, "
193
+ f"use the auto-copy (place PDFs in repo root), or set SEED_PDF_URLS."
194
+ )
195
 
196
+ # Load and chunk
197
  docs = []
198
  for p in pdfs:
199
  loader = PyMuPDFLoader(p)
 
202
  splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
203
  chunks = splitter.split_documents(docs)
204
 
205
+ if not chunks:
206
+ raise ValueError("No text chunks were generated from the PDFs. Are the files readable?")
207
+
208
+ # Embed + persist
209
  embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
210
  vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
211
  vs.persist()
212
  return vs
213
 
214
+ # -----------------------------
215
+ # LLM loader (with fallback)
216
+ # -----------------------------
217
  def load_llm():
218
  """
219
  Try to load primary (Mistral model). If it fails (OOM on CPU Space),
 
324
  # Utils
325
  numpy==1.26.4
326
  pandas==2.1.4
327
+ requests==2.32.3
328
  ''').strip() + "\n"
329
 
330
  RUNTIME_TXT = "python-3.10\n"
 
332
  DATA_README = dedent(r'''
333
  # Data folder
334
 
 
335
  Place your NITDA PDFs here. Example filenames:
336
+ - NITDA-ACT-2007-2019-Edition1.pdf
337
+ - Digital-Literacy-Framework.pdf
338
+ - FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf
339
+ - NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf
340
+ ''').strip() + "\n"
341
 
 
 
 
 
 
 
342
 
343
def write_project(project_dir: Path):
    """Write the generated app files into *project_dir*, creating it if needed.

    Emits app.py, requirements.txt, runtime.txt and data/README.md from the
    module-level templates, then prints the paths that were written.
    """
    project_dir.mkdir(parents=True, exist_ok=True)
    (project_dir / "app.py").write_text(APP_PY, encoding="utf-8")
    (project_dir / "requirements.txt").write_text(REQUIREMENTS_TXT, encoding="utf-8")
    (project_dir / "runtime.txt").write_text(RUNTIME_TXT, encoding="utf-8")
    data_dir = project_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    (data_dir / "README.md").write_text(DATA_README, encoding="utf-8")
    # NOTE: fixed mis-encoded checkmark emoji in the status message.
    print(f"✅ Wrote project to: {project_dir.resolve()}")
    for p in ["app.py", "requirements.txt", "runtime.txt", "data/README.md"]:
        print(" -", project_dir / p)
354
+
355
def deploy_to_space(project_dir: Path, space_id: str, private: bool = False):
    """Deploy the folder to a Hugging Face Space (SDK: Gradio). Requires HF_TOKEN env var.

    Raises:
        RuntimeError: if the HF_TOKEN environment variable is not set.
    """
    from huggingface_hub import HfApi, create_repo, login

    token = os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN not set. Create a token at https://huggingface.co/settings/tokens and `export HF_TOKEN=...`")
    login(token=token)
    # Creating an already-existing Space raises; treat that as "already there".
    try:
        create_repo(repo_id=space_id, repo_type="space", space_sdk="gradio", private=private)
        # NOTE: fixed mis-encoded emoji in the status messages below.
        print(f"🆕 Created Space: {space_id}")
    except Exception as e:
        print(f"ℹ️ Space exists or cannot be created: {e}")
    api = HfApi()
    api.upload_folder(
        folder_path=str(project_dir),
        repo_id=space_id,
        repo_type="space",
        commit_message="Deploy NITDA RAG",
        ignore_patterns=[".git", "__pycache__", "*.ipynb_checkpoints*"],
    )
    print(f"✅ Uploaded. Space: https://huggingface.co/spaces/{space_id}")
    print(f" App URL: https://{space_id.replace('/', '-')}.hf.space")
377
+
378
def main():
    """CLI entry point: generate the project locally and optionally deploy it.

    Exits with status 2 when --deploy is given without --space-id.
    """
    parser = argparse.ArgumentParser(description="Create and optionally deploy a NITDA RAG app to Hugging Face Spaces.")
    parser.add_argument("--project", required=True, help="Local project directory to create (e.g., nitda-rag)")
    parser.add_argument("--space-id", help="Hugging Face Space ID (e.g., nwamgbowo/nitda-rag)")
    parser.add_argument("--deploy", action="store_true", help="Upload the project to the specified Space")
    parser.add_argument("--private", action="store_true", help="Create the Space as private (default: public)")
    args = parser.parse_args()

    project_dir = Path(args.project).resolve()
    write_project(project_dir)

    # NOTE: fixed mis-encoded emoji characters in the user-facing messages below.
    if args.deploy:
        if not args.space_id:
            print("❌ --deploy requires --space-id (e.g., --space-id nwamgbowo/nitda-rag)")
            sys.exit(2)
        deploy_to_space(project_dir, args.space_id, private=args.private)
        print("\n🔔 After the Space is Running:")
        print(" 1) Upload PDFs to the data/ folder (or rely on auto-copy from root / URL seeding).")
        print(" 2) Click 'Initialize (build index + load model)'.")
        print(" 3) Ask questions.")
        print("\n💡 CPU Space tip: If Mistral fails to load, set Space Variable USE_TINYLLAMA=1 to force TinyLlama.\n")
    else:
        print("\n🚀 To run locally:")
        print(f" cd {project_dir}")
        print(" pip install -r requirements.txt")
        print(" python app.py")
        print("\n📌 Then open http://localhost:7860 and click 'Initialize (build index + load model)'.")
        print("📂 Put your PDFs under the data/ folder (or in repo root; auto-copy will handle it).")

if __name__ == "__main__":
    main()