Spaces:
Runtime error
Runtime error
Commit ·
547fc39
1
Parent(s): 1a09a45
🔧 Add LFS data download script for HF Spaces deployment
Browse files
- Created download_lfs_data.py to fetch 11,555 documents from LFS
- Added startup.sh to download data before app starts
- Added huggingface-hub==0.26.5 for snapshot_download
- Modified Dockerfile to use startup script
- Fixes: Dataset not found errors in production
- Dockerfile +6 -3
- download_lfs_data.py +53 -0
- requirements.txt +1 -0
- startup.sh +13 -0
Dockerfile
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# Use Python 3.12 as specified
|
| 2 |
FROM python:3.12-slim
|
| 3 |
|
| 4 |
-
# Install minimal system dependencies
|
| 5 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
curl \
|
| 7 |
&& rm -rf /var/lib/apt/lists/*
|
|
@@ -24,6 +24,9 @@ RUN pip install --no-cache-dir --upgrade pip && \
|
|
| 24 |
# Copy the application code
|
| 25 |
COPY --chown=user . /app
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
# Expose the port that HF Spaces requires
|
| 28 |
EXPOSE 7860
|
| 29 |
|
|
@@ -35,5 +38,5 @@ ENV PYTHONUNBUFFERED=1
|
|
| 35 |
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
| 36 |
CMD curl -f http://localhost:7860/health || exit 1
|
| 37 |
|
| 38 |
-
# Run the application
|
| 39 |
-
CMD ["
|
|
|
|
| 1 |
# Use Python 3.12 as specified
|
| 2 |
FROM python:3.12-slim
|
| 3 |
|
| 4 |
+
# Install minimal system dependencies (curl is used by the HEALTHCHECK; git-lfs is not installed here)
|
| 5 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
curl \
|
| 7 |
&& rm -rf /var/lib/apt/lists/*
|
|
|
|
| 24 |
# Copy the application code
|
| 25 |
COPY --chown=user . /app
|
| 26 |
|
| 27 |
+
# Make startup script executable
|
| 28 |
+
RUN chmod +x startup.sh
|
| 29 |
+
|
| 30 |
# Expose the port that HF Spaces requires
|
| 31 |
EXPOSE 7860
|
| 32 |
|
|
|
|
| 38 |
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
|
| 39 |
CMD curl -f http://localhost:7860/health || exit 1
|
| 40 |
|
| 41 |
+
# Run the application with LFS data download
|
| 42 |
+
CMD ["./startup.sh"]
|
download_lfs_data.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Download LanceDB data from Hugging Face LFS storage at startup.
|
| 4 |
+
This script downloads the Git LFS files that contain the 11,555 document database.
|
| 5 |
+
"""
|
| 6 |
+
import os
|
| 7 |
+
import subprocess
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
def download_lfs_files():
|
| 11 |
+
"""Download Git LFS files from the Hugging Face Space repository."""
|
| 12 |
+
print("💽 Checking for LanceDB data...")
|
| 13 |
+
|
| 14 |
+
lancedb_path = Path("lancedb_data")
|
| 15 |
+
|
| 16 |
+
# Check if database already exists and has data
|
| 17 |
+
if lancedb_path.exists():
|
| 18 |
+
rajasthan_docs = lancedb_path / "rajasthan_documents.lance"
|
| 19 |
+
if rajasthan_docs.exists():
|
| 20 |
+
# Check if it's a real file (not LFS pointer)
|
| 21 |
+
version_file = rajasthan_docs / "_versions"
|
| 22 |
+
if version_file.exists():
|
| 23 |
+
print("✅ LanceDB data already downloaded")
|
| 24 |
+
return True
|
| 25 |
+
else:
|
| 26 |
+
print("⚠️ LanceDB data is LFS pointer - downloading actual files...")
|
| 27 |
+
|
| 28 |
+
# Try to download LFS files using huggingface_hub
|
| 29 |
+
try:
|
| 30 |
+
from huggingface_hub import snapshot_download
|
| 31 |
+
|
| 32 |
+
space_id = "ChAbhishek28/PensionBot"
|
| 33 |
+
print(f"📥 Downloading LFS files from {space_id}...")
|
| 34 |
+
|
| 35 |
+
# Download only the lancedb_data folder
|
| 36 |
+
snapshot_download(
|
| 37 |
+
repo_id=space_id,
|
| 38 |
+
repo_type="space",
|
| 39 |
+
allow_patterns="lancedb_data/**",
|
| 40 |
+
local_dir=".",
|
| 41 |
+
local_dir_use_symlinks=False,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
print("✅ LFS files downloaded successfully")
|
| 45 |
+
return True
|
| 46 |
+
|
| 47 |
+
except Exception as e:
|
| 48 |
+
print(f"❌ Failed to download LFS files: {e}")
|
| 49 |
+
print("⚠️ Voice Bot will start with minimal documents")
|
| 50 |
+
return False
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
download_lfs_files()
|
requirements.txt
CHANGED
|
@@ -30,6 +30,7 @@ langchain-google-genai==2.0.7
|
|
| 30 |
# Vector Database & Embeddings
|
| 31 |
lancedb==0.13.0
|
| 32 |
sentence-transformers==3.3.1
|
|
|
|
| 33 |
einops==0.8.0
|
| 34 |
|
| 35 |
# Data Processing
|
|
|
|
| 30 |
# Vector Database & Embeddings
|
| 31 |
lancedb==0.13.0
|
| 32 |
sentence-transformers==3.3.1
|
| 33 |
+
huggingface-hub==0.26.5
|
| 34 |
einops==0.8.0
|
| 35 |
|
| 36 |
# Data Processing
|
startup.sh
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Startup script for Hugging Face Spaces deployment
|
| 3 |
+
# Downloads LFS database files and starts the application
|
| 4 |
+
|
| 5 |
+
echo "🚀 Voice Bot Startup Script"
|
| 6 |
+
|
| 7 |
+
# Download LFS files (11,555 documents database)
|
| 8 |
+
echo "📥 Downloading LanceDB data..."
|
| 9 |
+
python3 download_lfs_data.py
|
| 10 |
+
|
| 11 |
+
# Start the application
|
| 12 |
+
echo "🤖 Starting Voice Bot..."
|
| 13 |
+
exec uvicorn app:app --host 0.0.0.0 --port 7860
|