ChAbhishek28 committed on
Commit
547fc39
·
1 Parent(s): 1a09a45

🔧 Add LFS data download script for HF Spaces deployment

Browse files

- Created download_lfs_data.py to fetch 11,555 documents from LFS
- Added startup.sh to download data before app starts
- Added huggingface-hub==0.26.5 for snapshot_download
- Modified Dockerfile to use startup script
- Fixes: Dataset not found errors in production

Files changed (4) hide show
  1. Dockerfile +6 -3
  2. download_lfs_data.py +53 -0
  3. requirements.txt +1 -0
  4. startup.sh +13 -0
Dockerfile CHANGED
@@ -1,7 +1,7 @@
1
  # Use Python 3.12 as specified
2
  FROM python:3.12-slim
3
 
4
- # Install minimal system dependencies
5
  RUN apt-get update && apt-get install -y --no-install-recommends \
6
  curl \
7
  && rm -rf /var/lib/apt/lists/*
@@ -24,6 +24,9 @@ RUN pip install --no-cache-dir --upgrade pip && \
24
  # Copy the application code
25
  COPY --chown=user . /app
26
 
 
 
 
27
  # Expose the port that HF Spaces requires
28
  EXPOSE 7860
29
 
@@ -35,5 +38,5 @@ ENV PYTHONUNBUFFERED=1
35
  HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
36
  CMD curl -f http://localhost:7860/health || exit 1
37
 
38
- # Run the application
39
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  # Use Python 3.12 as specified
2
  FROM python:3.12-slim
3
 
4
+ # Install minimal system dependencies including git-lfs for database files
5
  RUN apt-get update && apt-get install -y --no-install-recommends \
6
  curl \
7
  && rm -rf /var/lib/apt/lists/*
 
24
  # Copy the application code
25
  COPY --chown=user . /app
26
 
27
+ # Make startup script executable
28
+ RUN chmod +x startup.sh
29
+
30
  # Expose the port that HF Spaces requires
31
  EXPOSE 7860
32
 
 
38
  HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
39
  CMD curl -f http://localhost:7860/health || exit 1
40
 
41
+ # Run the application with LFS data download
42
+ CMD ["./startup.sh"]
download_lfs_data.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download LanceDB data from Hugging Face LFS storage at startup.
4
+ This script downloads the Git LFS files that contain the 11,555 document database.
5
+ """
6
+ import os
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+ def download_lfs_files():
11
+ """Download Git LFS files from the Hugging Face Space repository."""
12
+ print("🔽 Checking for LanceDB data...")
13
+
14
+ lancedb_path = Path("lancedb_data")
15
+
16
+ # Check if database already exists and has data
17
+ if lancedb_path.exists():
18
+ rajasthan_docs = lancedb_path / "rajasthan_documents.lance"
19
+ if rajasthan_docs.exists():
20
+ # Check if it's a real file (not LFS pointer)
21
+ version_file = rajasthan_docs / "_versions"
22
+ if version_file.exists():
23
+ print("✅ LanceDB data already downloaded")
24
+ return True
25
+ else:
26
+ print("⚠️ LanceDB data is LFS pointer - downloading actual files...")
27
+
28
+ # Try to download LFS files using huggingface_hub
29
+ try:
30
+ from huggingface_hub import snapshot_download
31
+
32
+ space_id = "ChAbhishek28/PensionBot"
33
+ print(f"📥 Downloading LFS files from {space_id}...")
34
+
35
+ # Download only the lancedb_data folder
36
+ snapshot_download(
37
+ repo_id=space_id,
38
+ repo_type="space",
39
+ allow_patterns="lancedb_data/**",
40
+ local_dir=".",
41
+ local_dir_use_symlinks=False,
42
+ )
43
+
44
+ print("✅ LFS files downloaded successfully")
45
+ return True
46
+
47
+ except Exception as e:
48
+ print(f"❌ Failed to download LFS files: {e}")
49
+ print("⚠️ Voice Bot will start with minimal documents")
50
+ return False
51
+
52
+ if __name__ == "__main__":
53
+ download_lfs_files()
requirements.txt CHANGED
@@ -30,6 +30,7 @@ langchain-google-genai==2.0.7
30
  # Vector Database & Embeddings
31
  lancedb==0.13.0
32
  sentence-transformers==3.3.1
 
33
  einops==0.8.0
34
 
35
  # Data Processing
 
30
  # Vector Database & Embeddings
31
  lancedb==0.13.0
32
  sentence-transformers==3.3.1
33
+ huggingface-hub==0.26.5
34
  einops==0.8.0
35
 
36
  # Data Processing
startup.sh ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Startup script for Hugging Face Spaces deployment
3
+ # Downloads LFS database files and starts the application
4
+
5
+ echo "🚀 Voice Bot Startup Script"
6
+
7
+ # Download LFS files (11,555 documents database)
8
+ echo "📥 Downloading LanceDB data..."
9
+ python3 download_lfs_data.py
10
+
11
+ # Start the application
12
+ echo "🎤 Starting Voice Bot..."
13
+ exec uvicorn app:app --host 0.0.0.0 --port 7860