add huggingface hub support
Browse files- .dockerignore +29 -0
- Dockerfile +28 -0
- README.md +45 -0
- app/scripts/__init__.py +2 -0
- app/scripts/download_assets.py +203 -0
- app/src/llm.py +1 -1
- app/src/sentence.py +23 -6
- app/src/settings.py +5 -0
- requirements.txt +1 -0
.dockerignore
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore local environments
|
| 2 |
+
venv/
|
| 3 |
+
.venv/
|
| 4 |
+
env/
|
| 5 |
+
|
| 6 |
+
# Ignore Python cache
|
| 7 |
+
**/__pycache__/
|
| 8 |
+
*.pyc
|
| 9 |
+
|
| 10 |
+
# Ignore large data or logs
|
| 11 |
+
*.log
|
| 12 |
+
data/
|
| 13 |
+
*.csv
|
| 14 |
+
*.sqlite
|
| 15 |
+
|
| 16 |
+
# Ignore Git history
|
| 17 |
+
.git
|
| 18 |
+
.gitignore
|
| 19 |
+
|
| 20 |
+
# Ignore local IDE settings
|
| 21 |
+
.vscode/
|
| 22 |
+
.idea/
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Models and embeddings
|
| 26 |
+
.chroma/
|
| 27 |
+
embeddings/
|
| 28 |
+
data/
|
| 29 |
+
models/
|
Dockerfile
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /

# Install curl for the HEALTHCHECK below.
# --no-install-recommends keeps the image small; the apt lists are removed
# in the same layer so they never bloat the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY . .

# --no-cache-dir: pip's wheel cache is useless inside an image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

WORKDIR /app

# Create an entrypoint script that downloads assets (models + ChromaDB)
# before starting Streamlit. printf is used instead of echo because
# echo's handling of backslash escapes is shell-dependent, while printf
# interprets \n portably.
RUN printf '#!/bin/bash\npython3 /app/scripts/download_assets.py\nexec streamlit run main.py --server.port=8501 --server.address=0.0.0.0\n' \
    > /app/entrypoint.sh && chmod +x /app/entrypoint.sh

ENTRYPOINT ["/app/entrypoint.sh"]
|
README.md
CHANGED
|
@@ -6,6 +6,17 @@ To setup this project locally, you need to have a `.env` file in the root direct
|
|
| 6 |
|
| 7 |
.venv is the new virtual env
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
Then, install the dependencies:
|
| 11 |
|
|
@@ -19,6 +30,40 @@ Without development dependencies:
|
|
| 19 |
pip install -r requirements.txt
|
| 20 |
```
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
```
|
| 23 |
astra
|
| 24 |
├─ .chroma
|
|
|
|
| 6 |
|
| 7 |
.venv is the new virtual env
|
| 8 |
|
| 9 |
+
### Environment Variables
|
| 10 |
+
|
| 11 |
+
Required for API keys:
|
| 12 |
+
- `GROQ_API_KEY`: Your Groq API key (for LLM)
|
| 13 |
+
- `OPENAI_API_KEY`: Your OpenAI API key (optional, for OpenAI models)
|
| 14 |
+
- `COHERE_API_KEY`: Your Cohere API key (optional)
|
| 15 |
+
|
| 16 |
+
For Docker deployment with HuggingFace Hub:
|
| 17 |
+
- `HF_MODELS_REPO`: HuggingFace Hub repository for models (e.g., "username/astra-models")
|
| 18 |
+
- `HF_CHROMADB_REPO`: HuggingFace Hub repository for ChromaDB embeddings (e.g., "username/astra-chromadb")
|
| 19 |
+
- `HF_TOKEN`: HuggingFace Hub token (required for private repositories)
|
| 20 |
|
| 21 |
Then, install the dependencies:
|
| 22 |
|
|
|
|
| 30 |
pip install -r requirements.txt
|
| 31 |
```
|
| 32 |
|
| 33 |
+
## Docker Deployment
|
| 34 |
+
|
| 35 |
+
The Dockerfile is configured to automatically download models and ChromaDB embeddings from HuggingFace Hub at container startup.
|
| 36 |
+
|
| 37 |
+
### Setting up HuggingFace Hub
|
| 38 |
+
|
| 39 |
+
1. Create a HuggingFace account at https://huggingface.co/
|
| 40 |
+
2. Create repositories for your models and ChromaDB:
|
| 41 |
+
- Create a repository for models (e.g., `your-username/astra-models`)
|
| 42 |
+
- Upload your model directories (`bge-large_finetuned/`, `bge-small_finetuned/`) to this repository
|
| 43 |
+
- Create a repository for ChromaDB (e.g., `your-username/astra-chromadb`)
|
| 44 |
+
- Compress your `.chroma/` directory and upload it as `chromadb.tar.gz` or `chromadb.zip`
|
| 45 |
+
|
| 46 |
+
3. Set environment variables when running Docker:
|
| 47 |
+
```bash
|
| 48 |
+
docker run -e HF_MODELS_REPO=your-username/astra-models \
|
| 49 |
+
-e HF_CHROMADB_REPO=your-username/astra-chromadb \
|
| 50 |
+
-e HF_TOKEN=your_hf_token \
|
| 51 |
+
-e GROQ_API_KEY=your_groq_key \
|
| 52 |
+
-p 8501:8501 your-image-name
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Or use a `.env` file with Docker Compose or `--env-file` flag.
|
| 56 |
+
|
| 57 |
+
### Local Development
|
| 58 |
+
|
| 59 |
+
For local development, you can place models and ChromaDB files in the local directories:
|
| 60 |
+
- Models go in `models/` directory
|
| 61 |
+
- ChromaDB goes in `.chroma/` directory
|
| 62 |
+
|
| 63 |
+
The code will automatically use local files if available, falling back to HuggingFace Hub if not found.
|
| 64 |
+
|
| 65 |
+
## Project Structure
|
| 66 |
+
|
| 67 |
```
|
| 68 |
astra
|
| 69 |
├─ .chroma
|
app/scripts/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scripts package
|
| 2 |
+
|
app/scripts/download_assets.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Download assets (models and ChromaDB) from HuggingFace Hub if not already present locally.
|
| 3 |
+
This script runs at container startup to ensure required files are available.
|
| 4 |
+
"""
|
| 5 |
+
import os
|
| 6 |
+
import shutil
|
| 7 |
+
import sys
|
| 8 |
+
import tarfile
|
| 9 |
+
import zipfile
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
from huggingface_hub import snapshot_download, hf_hub_download
|
| 14 |
+
from huggingface_hub.utils import HfHubHTTPError
|
| 15 |
+
except ImportError:
|
| 16 |
+
print("ERROR: huggingface_hub not installed. Please install it first.")
|
| 17 |
+
sys.exit(1)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_project_root():
    """Return the project root directory.

    This script lives in ``app/scripts/``, so the root is two levels above
    this file's own directory.
    """
    return Path(__file__).parent.parent.parent
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def download_models(models_repo: str, models_dir: Path, hf_token: str | None = None) -> None:
|
| 28 |
+
"""
|
| 29 |
+
Download models from HuggingFace Hub if not present locally.
|
| 30 |
+
|
| 31 |
+
Args:
|
| 32 |
+
models_repo: HuggingFace Hub repository (e.g., "username/astra-models")
|
| 33 |
+
models_dir: Local directory to store models
|
| 34 |
+
hf_token: Optional HuggingFace token for private repos
|
| 35 |
+
"""
|
| 36 |
+
if not models_repo:
|
| 37 |
+
print("WARNING: HF_MODELS_REPO not set. Skipping model download.")
|
| 38 |
+
return
|
| 39 |
+
|
| 40 |
+
print(f"Checking models in {models_dir}...")
|
| 41 |
+
|
| 42 |
+
# Check if models directory already has content
|
| 43 |
+
if models_dir.exists() and any(models_dir.iterdir()):
|
| 44 |
+
print(f"Models directory already contains files. Skipping download.")
|
| 45 |
+
print(f"To force re-download, delete {models_dir} and restart.")
|
| 46 |
+
return
|
| 47 |
+
|
| 48 |
+
print(f"Downloading models from {models_repo}...")
|
| 49 |
+
try:
|
| 50 |
+
# Ensure models directory exists
|
| 51 |
+
models_dir.mkdir(parents=True, exist_ok=True)
|
| 52 |
+
|
| 53 |
+
# Download the entire repository
|
| 54 |
+
snapshot_download(
|
| 55 |
+
repo_id=models_repo,
|
| 56 |
+
local_dir=str(models_dir),
|
| 57 |
+
token=hf_token,
|
| 58 |
+
resume_download=True,
|
| 59 |
+
)
|
| 60 |
+
print(f"✓ Models downloaded successfully to {models_dir}")
|
| 61 |
+
except HfHubHTTPError as e:
|
| 62 |
+
print(f"ERROR: Failed to download models from {models_repo}")
|
| 63 |
+
print(f"Error: {e}")
|
| 64 |
+
print("Make sure the repository exists and is accessible.")
|
| 65 |
+
sys.exit(1)
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"ERROR: Unexpected error while downloading models: {e}")
|
| 68 |
+
sys.exit(1)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def download_chromadb(chromadb_repo: str, chromadb_dir: Path, hf_token: str | None = None) -> None:
|
| 72 |
+
"""
|
| 73 |
+
Download ChromaDB archive from HuggingFace Hub and extract it.
|
| 74 |
+
|
| 75 |
+
Args:
|
| 76 |
+
chromadb_repo: HuggingFace Hub repository (e.g., "username/astra-chromadb")
|
| 77 |
+
chromadb_dir: Local directory for ChromaDB
|
| 78 |
+
hf_token: Optional HuggingFace token for private repos
|
| 79 |
+
"""
|
| 80 |
+
if not chromadb_repo:
|
| 81 |
+
print("WARNING: HF_CHROMADB_REPO not set. Skipping ChromaDB download.")
|
| 82 |
+
return
|
| 83 |
+
|
| 84 |
+
print(f"Checking ChromaDB in {chromadb_dir}...")
|
| 85 |
+
|
| 86 |
+
# Check if ChromaDB directory already has content
|
| 87 |
+
if chromadb_dir.exists() and any(chromadb_dir.iterdir()):
|
| 88 |
+
print(f"ChromaDB directory already contains files. Skipping download.")
|
| 89 |
+
print(f"To force re-download, delete {chromadb_dir} and restart.")
|
| 90 |
+
return
|
| 91 |
+
|
| 92 |
+
print(f"Downloading ChromaDB from {chromadb_repo}...")
|
| 93 |
+
try:
|
| 94 |
+
# Ensure chromadb directory exists
|
| 95 |
+
chromadb_dir.mkdir(parents=True, exist_ok=True)
|
| 96 |
+
|
| 97 |
+
# Try common archive filenames
|
| 98 |
+
archive_names = ["chromadb.tar.gz", "chromadb.zip", "chroma.tar.gz", "chroma.zip", ".chroma.tar.gz", ".chroma.zip"]
|
| 99 |
+
|
| 100 |
+
downloaded = False
|
| 101 |
+
for archive_name in archive_names:
|
| 102 |
+
try:
|
| 103 |
+
archive_path = hf_hub_download(
|
| 104 |
+
repo_id=chromadb_repo,
|
| 105 |
+
filename=archive_name,
|
| 106 |
+
local_dir=str(chromadb_dir.parent),
|
| 107 |
+
token=hf_token,
|
| 108 |
+
resume_download=True,
|
| 109 |
+
)
|
| 110 |
+
|
| 111 |
+
# Extract the archive
|
| 112 |
+
print(f"Extracting {archive_name}...")
|
| 113 |
+
if archive_name.endswith('.tar.gz'):
|
| 114 |
+
with tarfile.open(archive_path, 'r:gz') as tar:
|
| 115 |
+
# Get members and check if they're in a subdirectory
|
| 116 |
+
members = tar.getmembers()
|
| 117 |
+
# Extract to parent directory
|
| 118 |
+
tar.extractall(path=chromadb_dir.parent)
|
| 119 |
+
|
| 120 |
+
# If archive contains .chroma subdirectory, move contents up
|
| 121 |
+
extracted_chroma = chromadb_dir.parent / ".chroma"
|
| 122 |
+
if extracted_chroma.exists() and extracted_chroma.is_dir():
|
| 123 |
+
# Move contents from .chroma to chromadb_dir
|
| 124 |
+
for item in extracted_chroma.iterdir():
|
| 125 |
+
shutil.move(str(item), str(chromadb_dir / item.name))
|
| 126 |
+
extracted_chroma.rmdir()
|
| 127 |
+
elif archive_name.endswith('.zip'):
|
| 128 |
+
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
|
| 129 |
+
zip_ref.extractall(path=chromadb_dir.parent)
|
| 130 |
+
|
| 131 |
+
# If archive contains .chroma subdirectory, move contents up
|
| 132 |
+
extracted_chroma = chromadb_dir.parent / ".chroma"
|
| 133 |
+
if extracted_chroma.exists() and extracted_chroma.is_dir():
|
| 134 |
+
# Move contents from .chroma to chromadb_dir
|
| 135 |
+
for item in extracted_chroma.iterdir():
|
| 136 |
+
shutil.move(str(item), str(chromadb_dir / item.name))
|
| 137 |
+
extracted_chroma.rmdir()
|
| 138 |
+
|
| 139 |
+
# Clean up the archive file
|
| 140 |
+
os.remove(archive_path)
|
| 141 |
+
print(f"✓ ChromaDB downloaded and extracted successfully to {chromadb_dir}")
|
| 142 |
+
downloaded = True
|
| 143 |
+
break
|
| 144 |
+
except HfHubHTTPError:
|
| 145 |
+
# Try next archive name
|
| 146 |
+
continue
|
| 147 |
+
|
| 148 |
+
if not downloaded:
|
| 149 |
+
# If no archive found, try downloading as a snapshot (directory structure)
|
| 150 |
+
print("No archive found, trying to download as directory snapshot...")
|
| 151 |
+
snapshot_download(
|
| 152 |
+
repo_id=chromadb_repo,
|
| 153 |
+
local_dir=str(chromadb_dir),
|
| 154 |
+
token=hf_token,
|
| 155 |
+
resume_download=True,
|
| 156 |
+
)
|
| 157 |
+
print(f"✓ ChromaDB downloaded successfully to {chromadb_dir}")
|
| 158 |
+
|
| 159 |
+
except HfHubHTTPError as e:
|
| 160 |
+
print(f"ERROR: Failed to download ChromaDB from {chromadb_repo}")
|
| 161 |
+
print(f"Error: {e}")
|
| 162 |
+
print("Make sure the repository exists and is accessible.")
|
| 163 |
+
sys.exit(1)
|
| 164 |
+
except Exception as e:
|
| 165 |
+
print(f"ERROR: Unexpected error while downloading ChromaDB: {e}")
|
| 166 |
+
sys.exit(1)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def main():
    """Download all required assets (models and ChromaDB) from HuggingFace Hub."""
    banner = "=" * 60
    print(banner)
    print("Downloading assets from HuggingFace Hub...")
    print(banner)

    root = get_project_root()

    # Configuration comes from environment variables set at deploy time.
    models_repo = os.getenv("HF_MODELS_REPO", "")
    chromadb_repo = os.getenv("HF_CHROMADB_REPO", "")
    hf_token = os.getenv("HF_TOKEN", None)

    if models_repo:
        download_models(models_repo, root / "models", hf_token)
    else:
        print("INFO: HF_MODELS_REPO not configured. Models must be available locally.")

    if chromadb_repo:
        download_chromadb(chromadb_repo, root / ".chroma", hf_token)
    else:
        print("INFO: HF_CHROMADB_REPO not configured. ChromaDB must be available locally.")

    print(banner)
    print("Asset download complete!")
    print(banner)


if __name__ == "__main__":
    main()
|
| 203 |
+
|
app/src/llm.py
CHANGED
|
@@ -9,7 +9,7 @@ load_dotenv()
|
|
| 9 |
|
| 10 |
CHAT_MODEL = Literal["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it"]
|
| 11 |
groq_api_key = os.getenv('GROQ_API_KEY')
|
| 12 |
-
openrouter_api_key =
|
| 13 |
|
| 14 |
client = Groq(
|
| 15 |
api_key=groq_api_key,
|
|
|
|
| 9 |
|
| 10 |
CHAT_MODEL = Literal["llama3-8b-8192", "llama3-70b-8192", "mixtral-8x7b-32768", "gemma-7b-it"]
|
| 11 |
groq_api_key = os.getenv('GROQ_API_KEY')
|
| 12 |
+
openrouter_api_key = settings.openrouter_api_key
|
| 13 |
|
| 14 |
client = Groq(
|
| 15 |
api_key=groq_api_key,
|
app/src/sentence.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import Literal, List
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
|
|
@@ -10,19 +10,36 @@ EMBED_MODEL = Literal["BAAI/bge-small-en-v1.5", "BAAI/bge-base-en-v1.5", "BAAI/b
|
|
| 10 |
|
| 11 |
|
| 12 |
def sentence_embed(
|
| 13 |
-
texts: str | List[str], model_name_or_path: EMBED_MODEL = "BAAI/bge-large-en-v1.5", device: str = "cpu"
|
| 14 |
) -> list[list[float]]:
|
| 15 |
"""
|
| 16 |
Embeds the given texts using the specified model.
|
| 17 |
|
| 18 |
Args:
|
| 19 |
-
texts (str | List[str]
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
Returns:
|
| 23 |
-
|
| 24 |
"""
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
embeddings: np.ndarray = model.encode(sentences=texts, device=device, show_progress_bar=True)
|
| 27 |
embeddings_list: list = embeddings.tolist()
|
| 28 |
return embeddings_list
|
|
|
|
| 1 |
+
from typing import Literal, List, Union
|
| 2 |
import numpy as np
|
| 3 |
import os
|
| 4 |
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def sentence_embed(
    texts: str | List[str], model_name_or_path: Union[str, EMBED_MODEL] = "BAAI/bge-large-en-v1.5", device: str = "cpu"
) -> list[list[float]]:
    """
    Embeds the given texts using the specified model.

    Args:
        texts (str | List[str]): The list of texts or text to embed.
        model_name_or_path (Union[str, EMBED_MODEL]): The embedding model to use.
            Can be:
            - A HuggingFace Hub identifier (e.g., "BAAI/bge-large-en-v1.5" or "username/repo-name")
            - A local path relative to MODELS_DIR (e.g., "bge-small_finetuned")
            - An absolute path
        device (str): Device to use for encoding (default: "cpu").

    Returns:
        list[list[float]]: The embeddings of the texts.
    """
    # Prefer a locally cached copy under MODELS_DIR when one exists.
    # NOTE: os.path.join discards MODELS_DIR when model_name_or_path is
    # absolute, so absolute paths are covered by this same existence check.
    local_model_path = os.path.join(MODELS_DIR, model_name_or_path)

    if os.path.exists(local_model_path):
        model_path = local_model_path
    else:
        # Assume an HF Hub identifier (or a path SentenceTransformer can
        # resolve); SentenceTransformer downloads Hub models automatically.
        model_path = model_name_or_path

    # Load the model directly on the requested device so weights are not
    # first materialized on CPU and then moved at encode() time.
    model = SentenceTransformer(model_path, device=device)
    embeddings: np.ndarray = model.encode(sentences=texts, device=device, show_progress_bar=True)
    embeddings_list: list = embeddings.tolist()
    return embeddings_list
|
app/src/settings.py
CHANGED
|
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
|
|
| 17 |
cohere_api_key: str = ""
|
| 18 |
groq_api_key: str = ""
|
| 19 |
openai_api_key: str = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
settings = Settings()
|
|
|
|
| 17 |
cohere_api_key: str = ""
|
| 18 |
groq_api_key: str = ""
|
| 19 |
openai_api_key: str = ""
|
| 20 |
+
openrouter_api_key: str = ""
|
| 21 |
+
# HuggingFace Hub configuration
|
| 22 |
+
hf_models_repo: str = os.getenv("HF_MODELS_REPO", "")
|
| 23 |
+
hf_chromadb_repo: str = os.getenv("HF_CHROMADB_REPO", "")
|
| 24 |
+
hf_token: str = os.getenv("HF_TOKEN", "")
|
| 25 |
|
| 26 |
|
| 27 |
settings = Settings()
|
requirements.txt
CHANGED
|
@@ -2,6 +2,7 @@ chromadb==0.5.0
|
|
| 2 |
datasets==2.19.0
|
| 3 |
evaluate==0.4.2
|
| 4 |
groq==1.0.0
|
|
|
|
| 5 |
numpy==1.24.3
|
| 6 |
openai==2.14.0
|
| 7 |
pandas==2.0.3
|
|
|
|
| 2 |
datasets==2.19.0
|
| 3 |
evaluate==0.4.2
|
| 4 |
groq==1.0.0
|
| 5 |
+
huggingface_hub==0.24.0
|
| 6 |
numpy==1.24.3
|
| 7 |
openai==2.14.0
|
| 8 |
pandas==2.0.3
|