Spaces:
Running
Running
github-actions[bot] committed on
Commit ·
9594951
1
Parent(s): b5cb5bb
🚀 Auto-deploy backend from GitHub (ce99ac1)
Browse files
scripts/download_vectorstore_from_firebase.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Download vectorstore directory from Firebase Storage at container startup.
|
| 3 |
+
Run: python -m backend.scripts.download_vectorstore_from_firebase
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger("mathpulse.download_vectorstore")
|
| 14 |
+
|
| 15 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
| 16 |
+
|
| 17 |
+
from backend.rag.firebase_storage_loader import _init_firebase_storage
|
| 18 |
+
|
| 19 |
+
REMOTE_PREFIX = "vectorstore/"
|
| 20 |
+
LOCAL_DEST_DIR = Path("/app/datasets/vectorstore")
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def download_vectorstore(dest_dir: Path, prefix: str = REMOTE_PREFIX) -> bool:
    """Download every object under ``prefix`` from Firebase Storage into ``dest_dir``.

    The part of each object name that follows ``prefix`` is used as the path
    below ``dest_dir``, so the remote layout is preserved locally. A failure on
    one object is logged and counted; the remaining objects are still attempted.

    Args:
        dest_dir: Local directory that receives the files; created if missing.
        prefix: Remote object-name prefix to mirror.

    Returns:
        ``True`` when at least one file was downloaded and no object failed.
        ``False`` when the storage bucket is unavailable, nothing downloadable
        exists under ``prefix``, or any object could not be fetched.
    """
    _, bucket = _init_firebase_storage()
    if bucket is None:
        logger.warning("Firebase Storage not available, vectorstore download skipped")
        return False

    dest_dir.mkdir(parents=True, exist_ok=True)
    dest_root = dest_dir.resolve()

    blobs = list(bucket.list_blobs(prefix=prefix))
    if not blobs:
        logger.warning("No blobs found under prefix: %s", prefix)
        return False

    downloaded = 0
    errors = 0

    for blob in blobs:
        relative_path = blob.name[len(prefix):].lstrip("/")
        # Skip the prefix object itself and "folder" placeholder objects (names
        # ending in "/"). Writing a placeholder as a regular file would later
        # prevent the directory of the same name from being created.
        if not relative_path or relative_path.endswith("/"):
            continue

        local_path = dest_dir / relative_path

        # Object names are remote data: refuse any name (e.g. containing "..")
        # that would resolve to a location outside the destination directory.
        try:
            local_path.resolve().relative_to(dest_root)
        except ValueError:
            logger.error("Refusing to download %s: path escapes %s", blob.name, dest_dir)
            errors += 1
            continue

        try:
            # Directory creation is part of the per-object attempt so that one
            # bad path is counted as an error instead of aborting the whole run.
            local_path.parent.mkdir(parents=True, exist_ok=True)
            blob.download_to_filename(str(local_path))
        except Exception as e:
            logger.error("Failed to download %s: %s", blob.name, e)
            errors += 1
        else:
            logger.info("Downloaded: %s (%d bytes)", blob.name, blob.size or 0)
            downloaded += 1

    logger.info("Download complete: %d files downloaded, %d errors", downloaded, errors)

    if downloaded == 0 and errors == 0:
        # Only placeholder objects were listed; report this the same way as an
        # empty prefix rather than claiming success with nothing on disk.
        logger.warning("No downloadable files found under prefix: %s", prefix)
        return False

    return errors == 0
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
if __name__ == "__main__":
    # Script entry point: configure console logging, then mirror the remote
    # vectorstore prefix into the fixed local destination directory.
    logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
    download_vectorstore(dest_dir=LOCAL_DEST_DIR, prefix=REMOTE_PREFIX)
|
scripts/upload_vectorstore_to_firebase.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Upload vectorstore directory to Firebase Storage.
|
| 3 |
+
Run: python -m backend.scripts.upload_vectorstore_to_firebase
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import logging
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger("mathpulse.upload_vectorstore")
|
| 14 |
+
|
| 15 |
+
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
| 16 |
+
|
| 17 |
+
from backend.rag.firebase_storage_loader import _init_firebase_storage
|
| 18 |
+
|
| 19 |
+
VECTORSTORE_SOURCE_DIR = Path(__file__).resolve().parents[3] / "datasets" / "vectorstore"
|
| 20 |
+
REMOTE_PREFIX = "vectorstore/"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def upload_directory(local_dir: Path, bucket, prefix: str) -> tuple[int, int]:
    """Recursively upload the files under ``local_dir`` to Firebase Storage.

    Each file is stored under ``prefix`` followed by its POSIX-style path
    relative to ``local_dir``. A failure on one file is logged and counted; the
    remaining files are still attempted.

    Args:
        local_dir: Local directory whose files are uploaded.
        bucket: Storage bucket object providing ``blob(name)``; the returned
            blob must provide ``upload_from_filename(path)``.
        prefix: Remote folder prefix. A missing trailing ``/`` is added so the
            prefix is never glued onto the first path component. An empty
            prefix uploads to the bucket root.

    Returns:
        A ``(uploaded, skipped)`` tuple: the number of files uploaded and the
        number of files that failed.
    """
    # Treat the prefix as a folder name: "vectorstore" and "vectorstore/" must
    # both produce "vectorstore/<relative path>".
    if prefix and not prefix.endswith("/"):
        prefix += "/"

    uploaded = 0
    skipped = 0

    for root, _dirs, files in os.walk(local_dir):
        for filename in files:
            local_path = Path(root) / filename
            relative_path = local_path.relative_to(local_dir)
            remote_path = f"{prefix}{relative_path.as_posix()}"

            try:
                # Read the size before uploading so a later stat() failure
                # cannot mark an already uploaded file as skipped.
                size = local_path.stat().st_size
                blob = bucket.blob(remote_path)
                blob.upload_from_filename(str(local_path))
            except Exception as e:
                logger.error("Failed to upload %s: %s", remote_path, e)
                skipped += 1
            else:
                logger.info("Uploaded: %s (%d bytes)", remote_path, size)
                uploaded += 1

    return uploaded, skipped
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
    # Script entry point: parse CLI options, validate the source directory,
    # connect to Firebase Storage, upload, and report the outcome through the
    # process exit status.
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(description="Upload vectorstore to Firebase Storage")
    parser.add_argument("--source", type=str, default=str(VECTORSTORE_SOURCE_DIR),
                        help="Local vectorstore directory")
    parser.add_argument("--prefix", type=str, default=REMOTE_PREFIX,
                        help="Remote path prefix in Firebase Storage")
    args = parser.parse_args()

    source_dir = Path(args.source)
    # is_dir() rather than exists(): a regular file would pass exists() and
    # then upload nothing without reporting any problem.
    if not source_dir.is_dir():
        logger.error("Source directory does not exist or is not a directory: %s", source_dir)
        sys.exit(1)

    _, bucket = _init_firebase_storage()
    if bucket is None:
        logger.error("Firebase Storage not available")
        sys.exit(1)

    logger.info("Uploading vectorstore from %s to gs://%s/%s",
                source_dir, bucket.name, args.prefix)
    uploaded, skipped = upload_directory(source_dir, bucket, args.prefix)
    logger.info("Upload complete: %d uploaded, %d skipped", uploaded, skipped)

    # A partial upload leaves an incomplete remote vectorstore; signal it to
    # the caller (shell, CI job) like the other failure paths above.
    if skipped:
        sys.exit(1)
|