Spaces:
Sleeping
Sleeping
Update src/streamlit_app.py
Browse files- src/streamlit_app.py +160 -11
src/streamlit_app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
# -*- coding: utf-8 -*-
|
| 3 |
|
|
@@ -32,6 +33,7 @@ from textwrap import dedent
|
|
| 32 |
APP_PY = dedent(r'''
|
| 33 |
import os
|
| 34 |
import time
|
|
|
|
| 35 |
import traceback
|
| 36 |
from typing import List
|
| 37 |
|
|
@@ -45,6 +47,7 @@ from langchain_community.vectorstores import Chroma
|
|
| 45 |
|
| 46 |
from huggingface_hub import hf_hub_download
|
| 47 |
from llama_cpp import Llama
|
|
|
|
| 48 |
|
| 49 |
# -----------------------------
|
| 50 |
# Config
|
|
@@ -98,22 +101,99 @@ QNA_TEMPLATE = """[SYSTEM]
|
|
| 98 |
"""
|
| 99 |
|
| 100 |
# -----------------------------
|
| 101 |
-
#
|
| 102 |
# -----------------------------
|
| 103 |
def list_pdfs(folder: str):
|
| 104 |
os.makedirs(folder, exist_ok=True)
|
| 105 |
return [os.path.join(folder, f) for f in os.listdir(folder) if f.lower().endswith(".pdf")]
|
| 106 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
def build_or_load_vectorstore():
|
| 108 |
"""Load existing Chroma DB if present; else build from PDFs in data/."""
|
|
|
|
| 109 |
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
|
| 110 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 111 |
return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
|
| 112 |
|
| 113 |
pdfs = list_pdfs(DOCS_DIR)
|
| 114 |
if not pdfs:
|
| 115 |
-
raise FileNotFoundError(
|
|
|
|
|
|
|
|
|
|
| 116 |
|
|
|
|
| 117 |
docs = []
|
| 118 |
for p in pdfs:
|
| 119 |
loader = PyMuPDFLoader(p)
|
|
@@ -122,11 +202,18 @@ def build_or_load_vectorstore():
|
|
| 122 |
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 123 |
chunks = splitter.split_documents(docs)
|
| 124 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 126 |
vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
|
| 127 |
vs.persist()
|
| 128 |
return vs
|
| 129 |
|
|
|
|
|
|
|
|
|
|
| 130 |
def load_llm():
|
| 131 |
"""
|
| 132 |
Try to load primary (Mistral model). If it fails (OOM on CPU Space),
|
|
@@ -237,6 +324,7 @@ pymupdf==1.23.26
|
|
| 237 |
# Utils
|
| 238 |
numpy==1.26.4
|
| 239 |
pandas==2.1.4
|
|
|
|
| 240 |
''').strip() + "\n"
|
| 241 |
|
| 242 |
RUNTIME_TXT = "python-3.10\n"
|
|
@@ -244,16 +332,77 @@ RUNTIME_TXT = "python-3.10\n"
|
|
| 244 |
DATA_README = dedent(r'''
|
| 245 |
# Data folder
|
| 246 |
|
| 247 |
-
|
| 248 |
Place your NITDA PDFs here. Example filenames:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
-
python build_and_deploy_nitda_rag.py \
|
| 251 |
-
--space-id nwamgbowo/nitda-rag \
|
| 252 |
-
--pdf "/path/to/NITDA-ACT-2007-2019-Edition1.pdf" \
|
| 253 |
-
--pdf "/path/to/Digital-Literacy-Framework.pdf" \
|
| 254 |
-
--pdf "/path/to/FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf" \
|
| 255 |
-
--pdf "/path/to/NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf"
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
-
|
| 259 |
-
)
|
|
|
|
| 1 |
+
|
| 2 |
#!/usr/bin/env python3
|
| 3 |
# -*- coding: utf-8 -*-
|
| 4 |
|
|
|
|
| 33 |
APP_PY = dedent(r'''
|
| 34 |
import os
|
| 35 |
import time
|
| 36 |
+
import shutil
|
| 37 |
import traceback
|
| 38 |
from typing import List
|
| 39 |
|
|
|
|
| 47 |
|
| 48 |
from huggingface_hub import hf_hub_download
|
| 49 |
from llama_cpp import Llama
|
| 50 |
+
import requests
|
| 51 |
|
| 52 |
# -----------------------------
|
| 53 |
# Config
|
|
|
|
| 101 |
"""
|
| 102 |
|
| 103 |
# -----------------------------
|
| 104 |
+
# Auto-copy & seeding (STARTUP)
|
| 105 |
# -----------------------------
|
| 106 |
def list_pdfs(folder: str):
    """Return full paths of every *.pdf file in *folder*, creating the folder if absent."""
    os.makedirs(folder, exist_ok=True)
    pdf_paths = []
    for entry in os.listdir(folder):
        if entry.lower().endswith(".pdf"):
            pdf_paths.append(os.path.join(folder, entry))
    return pdf_paths
|
| 109 |
|
| 110 |
+
def seed_data_from_urls_if_empty():
    """Seed DOCS_DIR with PDFs downloaded from the SEED_PDF_URLS env var.

    SEED_PDF_URLS is a comma-separated list of URLs. Nothing is downloaded
    when DOCS_DIR already holds at least one PDF or when the variable is
    unset/empty. Returns the number of files successfully written.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)
    if any(name.lower().endswith(".pdf") for name in os.listdir(DOCS_DIR)):
        return 0

    raw_urls = os.getenv("SEED_PDF_URLS", "").strip()
    if not raw_urls:
        return 0

    downloaded = 0
    candidates = (u.strip() for u in raw_urls.split(","))
    for pdf_url in (u for u in candidates if u):
        try:
            # Derive a filename from the URL path, ignoring any query string.
            base_name = os.path.basename(pdf_url.split("?")[0]) or "document.pdf"
            target = os.path.join(DOCS_DIR, base_name)
            resp = requests.get(pdf_url, timeout=120)
            resp.raise_for_status()
            with open(target, "wb") as fh:
                fh.write(resp.content)
            downloaded += 1
            print(f"[seed] Downloaded: {target}")
        except Exception as exc:
            # Best-effort seeding: a bad URL must not abort app startup.
            print(f"[seed] Failed to download {pdf_url}: {exc}")
    return downloaded
|
| 138 |
+
|
| 139 |
+
def ensure_data_ready_and_reset_index_if_changed():
    """Prepare data/ at startup and invalidate the vector DB when its inputs changed.

    Steps:
    - Create DOCS_DIR if missing.
    - Copy any *.pdf from the repo root into DOCS_DIR (never overwrites existing files).
    - Seed DOCS_DIR from SEED_PDF_URLS when it is still empty.
    - If anything was added, delete DB_DIR so the Chroma index is rebuilt from fresh data.
    """
    os.makedirs(DOCS_DIR, exist_ok=True)

    before = set(os.listdir(DOCS_DIR))
    copied = 0

    # Copy *.pdf from root into data/ (non-destructive: skips files already present).
    for fname in os.listdir("."):
        if fname.lower().endswith(".pdf"):
            src = os.path.join(".", fname)
            dst = os.path.join(DOCS_DIR, fname)
            if not os.path.exists(dst):
                try:
                    shutil.copy2(src, dst)
                    copied += 1
                    # NOTE(review): the arrow here was a mojibake byte in the pasted source; restored.
                    print(f"[init] Copied root PDF → {dst}")
                except Exception as e:
                    print(f"[init] Could not copy {src} to {dst}: {e}")

    seeded = seed_data_from_urls_if_empty()

    after = set(os.listdir(DOCS_DIR))
    changed = (copied > 0) or (seeded > 0) or (before != after)

    # A stale index over a changed corpus would silently serve wrong context, so drop it.
    if changed and os.path.isdir(DB_DIR):
        try:
            shutil.rmtree(DB_DIR)
            print(f"[init] Removed old vector DB at {DB_DIR}/ (changed data/: {copied} copied, {seeded} seeded)")
        except Exception as e:
            print(f"[init] Could not remove {DB_DIR}/: {e}")
|
| 175 |
+
|
| 176 |
+
# Call once on import (top-level)
|
| 177 |
+
ensure_data_ready_and_reset_index_if_changed()
|
| 178 |
+
|
| 179 |
+
# -----------------------------
|
| 180 |
+
# Vector store builder/loader
|
| 181 |
+
# -----------------------------
|
| 182 |
def build_or_load_vectorstore():
|
| 183 |
"""Load existing Chroma DB if present; else build from PDFs in data/."""
|
| 184 |
+
# Use persisted DB if present
|
| 185 |
if os.path.isdir(DB_DIR) and os.listdir(DB_DIR):
|
| 186 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 187 |
return Chroma(persist_directory=DB_DIR, embedding_function=embeddings)
|
| 188 |
|
| 189 |
pdfs = list_pdfs(DOCS_DIR)
|
| 190 |
if not pdfs:
|
| 191 |
+
raise FileNotFoundError(
|
| 192 |
+
f"No PDFs found in '{DOCS_DIR}'. Upload PDFs to the 'data/' folder, "
|
| 193 |
+
f"use the auto-copy (place PDFs in repo root), or set SEED_PDF_URLS."
|
| 194 |
+
)
|
| 195 |
|
| 196 |
+
# Load and chunk
|
| 197 |
docs = []
|
| 198 |
for p in pdfs:
|
| 199 |
loader = PyMuPDFLoader(p)
|
|
|
|
| 202 |
splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
|
| 203 |
chunks = splitter.split_documents(docs)
|
| 204 |
|
| 205 |
+
if not chunks:
|
| 206 |
+
raise ValueError("No text chunks were generated from the PDFs. Are the files readable?")
|
| 207 |
+
|
| 208 |
+
# Embed + persist
|
| 209 |
embeddings = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 210 |
vs = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=DB_DIR)
|
| 211 |
vs.persist()
|
| 212 |
return vs
|
| 213 |
|
| 214 |
+
# -----------------------------
|
| 215 |
+
# LLM loader (with fallback)
|
| 216 |
+
# -----------------------------
|
| 217 |
def load_llm():
|
| 218 |
"""
|
| 219 |
Try to load primary (Mistral model). If it fails (OOM on CPU Space),
|
|
|
|
| 324 |
# Utils
|
| 325 |
numpy==1.26.4
|
| 326 |
pandas==2.1.4
|
| 327 |
+
requests==2.32.3
|
| 328 |
''').strip() + "\n"
|
| 329 |
|
| 330 |
RUNTIME_TXT = "python-3.10\n"
|
|
|
|
| 332 |
DATA_README = dedent(r'''
|
| 333 |
# Data folder
|
| 334 |
|
|
|
|
| 335 |
Place your NITDA PDFs here. Example filenames:
|
| 336 |
+
- NITDA-ACT-2007-2019-Edition1.pdf
|
| 337 |
+
- Digital-Literacy-Framework.pdf
|
| 338 |
+
- FrameworkAndGuidelinesForPublicInternetAccessPIA1.pdf
|
| 339 |
+
- NATIONAL-REGULATORY-GUIDELINE-FOR-ELECTRONIC-INVOICING-IN-NIGERIA-2025.pdf
|
| 340 |
+
''').strip() + "\n"
|
| 341 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
+
def write_project(project_dir: Path):
    """Materialize the Space project under *project_dir*.

    Writes app.py, requirements.txt, runtime.txt and data/README.md from the
    module-level template constants, then prints the created paths.
    """
    project_dir.mkdir(parents=True, exist_ok=True)
    (project_dir / "app.py").write_text(APP_PY, encoding="utf-8")
    (project_dir / "requirements.txt").write_text(REQUIREMENTS_TXT, encoding="utf-8")
    (project_dir / "runtime.txt").write_text(RUNTIME_TXT, encoding="utf-8")
    data_dir = project_dir / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    (data_dir / "README.md").write_text(DATA_README, encoding="utf-8")
    # NOTE(review): the leading glyph was mojibake in the pasted source; restored to a check mark.
    print(f"✅ Wrote project to: {project_dir.resolve()}")
    for rel in ["app.py", "requirements.txt", "runtime.txt", "data/README.md"]:
        print(" -", project_dir / rel)
|
| 354 |
+
|
| 355 |
+
def deploy_to_space(project_dir: Path, space_id: str, private: bool = False):
    """Deploy the folder to a Hugging Face Space (SDK: Gradio). Requires HF_TOKEN env var.

    Raises RuntimeError when HF_TOKEN is unset; otherwise logs in, creates the
    Space if needed, and uploads *project_dir* as a single commit.
    """
    from huggingface_hub import HfApi, create_repo, login
    token = os.getenv("HF_TOKEN")
    if not token:
        raise RuntimeError("HF_TOKEN not set. Create a token at https://huggingface.co/settings/tokens and `export HF_TOKEN=...`")
    login(token=token)
    # Creating an already-existing Space raises; that's fine — we just upload into it.
    try:
        create_repo(repo_id=space_id, repo_type="space", space_sdk="gradio", private=private)
        print(f"🚀 Created Space: {space_id}")
    except Exception as e:
        print(f"ℹ️ Space exists or cannot be created: {e}")
    api = HfApi()
    api.upload_folder(
        folder_path=str(project_dir),
        repo_id=space_id,
        repo_type="space",
        commit_message="Deploy NITDA RAG",
        ignore_patterns=[".git", "__pycache__", "*.ipynb_checkpoints*"],
    )
    # NOTE(review): leading emojis were mojibake in the pasted source; reconstructed — confirm originals.
    print(f"✅ Uploaded. Space: https://huggingface.co/spaces/{space_id}")
    print(f" App URL: https://{space_id.replace('/', '-')}.hf.space")
|
| 377 |
+
|
| 378 |
+
def main():
    """CLI entry point: scaffold the project locally and optionally deploy it to a Space."""
    parser = argparse.ArgumentParser(description="Create and optionally deploy a NITDA RAG app to Hugging Face Spaces.")
    parser.add_argument("--project", required=True, help="Local project directory to create (e.g., nitda-rag)")
    parser.add_argument("--space-id", help="Hugging Face Space ID (e.g., nwamgbowo/nitda-rag)")
    parser.add_argument("--deploy", action="store_true", help="Upload the project to the specified Space")
    parser.add_argument("--private", action="store_true", help="Create the Space as private (default: public)")
    args = parser.parse_args()

    project_dir = Path(args.project).resolve()
    write_project(project_dir)

    if args.deploy:
        if not args.space_id:
            # --deploy without a target Space is a usage error; exit code 2 mirrors argparse's convention.
            print("❌ --deploy requires --space-id (e.g., --space-id nwamgbowo/nitda-rag)")
            sys.exit(2)
        deploy_to_space(project_dir, args.space_id, private=args.private)
        # NOTE(review): the leading emojis below were mojibake in the pasted source; reconstructed — confirm originals.
        print("\n📌 After the Space is Running:")
        print(" 1) Upload PDFs to the data/ folder (or rely on auto-copy from root / URL seeding).")
        print(" 2) Click 'Initialize (build index + load model)'.")
        print(" 3) Ask questions.")
        print("\n💡 CPU Space tip: If Mistral fails to load, set Space Variable USE_TINYLLAMA=1 to force TinyLlama.\n")
    else:
        print("\n📌 To run locally:")
        print(f" cd {project_dir}")
        print(" pip install -r requirements.txt")
        print(" python app.py")
        print("\n📌 Then open http://localhost:7860 and click 'Initialize (build index + load model)'.")
        print("📂 Put your PDFs under the data/ folder (or in repo root; auto-copy will handle it).")


if __name__ == "__main__":
    main()
|