Spaces:

kamkol
/

AB_Testing_RAG_Agent

Sleeping

App Files Files Community

kamkol committed on Apr 30, 2025

Commit

2585f8a

1 Parent(s): 52f5b9a

Better handling large preprocessed data file to Huggingface

Browse files

Files changed (11) hide show

.dockerignore +50 -6
.env.example +7 -5
.gitattributes +39 -0
.gitignore +2 -43
Dockerfile +14 -2
Makefile +2 -3
README.md +23 -1
download_pdfs.py +164 -0
huggingface-space.yml +1 -0
requirements.txt +3 -1
scripts/prepare_for_deployment.py +1 -3

.dockerignore CHANGED Viewed

@@ -44,11 +44,6 @@ notebook_version/
 # Do NOT exclude data directory, as we need to process PDFs during docker build if needed
 #/data
-# Chainlit-specific
-.chainlit
-chainlit.md
-chainlit.json
 # Misc
 .DS_Store
@@ -83,4 +78,53 @@ ENV/
 data/*.pdf
 # Pre-processed data (will be generated inside the container)
-processed_data/

 # Do NOT exclude data directory, as we need to process PDFs during docker build if needed
 #/data
 # Misc
 .DS_Store
 data/*.pdf
 # Pre-processed data (will be generated inside the container)
+processed_data/
+# Environment and secrets
+.env
+.venv
+venv/
+ENV/
+env/
+.env.local
+.env.development.local
+.env.test.local
+.env.production.local
+# IDE specific files
+.idea/
+.vscode/
+*.swp
+*.swo
+# Mac OS
+.DS_Store
+# Git related
+.git/
+.gitignore
+.github/
+# Local data - exclude PDFs but include processed data
+/data/*.pdf
+/data/raw/
+# Debug and log files
+*.log
+debug_*.py
+*_debug.py
+debug_*.log
+run_log.txt
+# Test related
+.coverage
+.pytest_cache/
+pytest_cache/
+htmlcov/
+# Build artifacts
+__pycache__/
+*.py[cod]
+*$py.class
+*.so

.env.example CHANGED Viewed

@@ -1,10 +1,12 @@
 # OpenAI API key
-OPENAI_API_KEY=your_openai_api_key
 # Optional backup key if primary is not set
 OPENAI_API_KEY_BACKUP=
-# Chainlit configuration
-CHAINLIT_MAX_STEPS_HISTORY=10
-CHAINLIT_AUTH_SECRET=
-CHAINLIT_SERVER_URL=

 # OpenAI API key
+OPENAI_API_KEY=your_openai_api_key_here
 # Optional backup key if primary is not set
 OPENAI_API_KEY_BACKUP=
+# Hugging Face Token - required for downloading PDFs from the Hugging Face dataset
+HF_TOKEN=your_huggingface_token_here
+# Streamlit configuration (optional)
+# STREAMLIT_SERVER_PORT=8501
+# STREAMLIT_SERVER_HEADLESS=true

.gitattributes CHANGED Viewed

	@@ -1 +1,40 @@
1	processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text

 processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# For SQLite database files in vector store
+*.sqlite filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -31,10 +31,6 @@ env/
 .env.test.local
 .env.production.local
-# Chainlit
-.chainlit/
-.chainlit/cache/
 # IDE specific files
 .idea/
 .vscode/
@@ -44,32 +40,13 @@ env/
 # Mac OS
 .DS_Store
-# Local data - keep PDFs private and never commit them
 /data/*.pdf
-/data/**/*.pdf
 # Do NOT ignore processed_data anymore - we want to commit this
 # /processed_data/
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-.DS_Store
-# Distribution / packaging
-dist/
-build/
-*.egg-info/
-# Virtual environment
-venv/
-env/
-.env
-# Environment variables
-.env.local
 # Debug and log files
 *.log
 debug_*.py
@@ -77,24 +54,6 @@ debug_*.py
 debug_*.log
 run_log.txt
-# Local data
-/data/*.pdf
-/data/raw/
-# Ignore Chainlit-specific files (using Streamlit instead)
-/.chainlit/
-chainlit.md
-chainlit.json
-# Keep processed data
-#/processed_data/
-# IDE and editor files
-.idea/
-.vscode/
-*.swp
-*.swo
 # Testing
 pytest_cache/
 .pytest_cache/

 .env.test.local
 .env.production.local
 # IDE specific files
 .idea/
 .vscode/
 # Mac OS
 .DS_Store
+# Local data - exclude PDFs but not directory structure
 /data/*.pdf
+/data/raw/
 # Do NOT ignore processed_data anymore - we want to commit this
 # /processed_data/
 # Debug and log files
 *.log
 debug_*.py
 debug_*.log
 run_log.txt
 # Testing
 pytest_cache/
 .pytest_cache/

Dockerfile CHANGED Viewed

@@ -30,6 +30,9 @@ RUN mkdir -p /root/.streamlit && \
     echo '[server]' >> /root/.streamlit/config.toml && \
     echo 'headless = true' >> /root/.streamlit/config.toml
 # Set environment variables
 ENV HOST=0.0.0.0
 ENV PORT=8000
@@ -40,9 +43,18 @@ EXPOSE $PORT
 # Create the entrypoint script
 RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
     echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
     echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
-    echo '    echo "No pre-processed data found. Running preprocessing..."' >> /app/entrypoint.sh && \
-    echo '    python scripts/preprocess_data.py' >> /app/entrypoint.sh && \
     echo 'else' >> /app/entrypoint.sh && \
     echo '    echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
     echo 'fi' >> /app/entrypoint.sh && \

     echo '[server]' >> /root/.streamlit/config.toml && \
     echo 'headless = true' >> /root/.streamlit/config.toml
+# Install additional required packages
+RUN pip install --no-cache-dir huggingface_hub datasets
 # Set environment variables
 ENV HOST=0.0.0.0
 ENV PORT=8000
 # Create the entrypoint script
 RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
     echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
+    echo 'echo "Checking for pre-processed data..."' >> /app/entrypoint.sh && \
     echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
+    echo '    echo "Pre-processed data not found. Downloading PDFs..."' >> /app/entrypoint.sh && \
+    echo '    if [ -n "${HF_TOKEN}" ]; then' >> /app/entrypoint.sh && \
+    echo '        python download_pdfs.py' >> /app/entrypoint.sh && \
+    echo '        echo "Running preprocessing..."' >> /app/entrypoint.sh && \
+    echo '        python scripts/preprocess_data.py' >> /app/entrypoint.sh && \
+    echo '    else' >> /app/entrypoint.sh && \
+    echo '        echo "Error: HF_TOKEN environment variable is not set. Cannot download PDFs."' >> /app/entrypoint.sh && \
+    echo '        echo "Please set the HF_TOKEN environment variable in your Hugging Face Space settings."' >> /app/entrypoint.sh && \
+    echo '        exit 1' >> /app/entrypoint.sh && \
+    echo '    fi' >> /app/entrypoint.sh && \
     echo 'else' >> /app/entrypoint.sh && \
     echo '    echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
     echo 'fi' >> /app/entrypoint.sh && \

Makefile CHANGED Viewed

@@ -4,7 +4,7 @@ setup:
 	python -m pip install -r requirements.txt
 run:
-	chainlit run app.py
 docker-build:
 	docker build -t ab-testing-rag-agent .
@@ -19,5 +19,4 @@ docker-compose-down:
 	docker-compose down
 clean:
-	rm -rf __pycache__
-	rm -rf .chainlit

 	python -m pip install -r requirements.txt
 run:
+	streamlit run streamlit_app.py
 docker-build:
 	docker build -t ab-testing-rag-agent .
 	docker-compose down
 clean:
+	rm -rf __pycache__

README.md CHANGED Viewed

@@ -79,7 +79,29 @@ git remote add hf https://huggingface.co/spaces/yourusername/ab-testing-rag
 git push hf main
 ```
-3. Set your OPENAI_API_KEY in the Hugging Face Space settings.
 ## Architecture

 git push hf main
 ```
+3. Set both required environment variables in the Hugging Face Space settings:
+   - `OPENAI_API_KEY`: Your OpenAI API key
+   - `HF_TOKEN`: Your Hugging Face token with access to the dataset
+### Setting Up The PDF Dataset on Hugging Face
+The deployment uses PDFs stored in a separate Hugging Face dataset repo. To set up your own:
+1. Create a dataset repository on Hugging Face called `yourusername/ab_testing_pdfs`
+2. Upload all your PDF files to this repository via the Hugging Face UI or git:
+   ```bash
+   git clone https://huggingface.co/datasets/yourusername/ab_testing_pdfs
+   cd ab_testing_pdfs
+   cp /path/to/your/pdfs/*.pdf .
+   git add .
+   git commit -m "Add AB Testing PDFs"
+   git push
+   ```
+3. Update the dataset name in `download_pdfs.py` if you used a different repository name
+4. Make sure your `HF_TOKEN` has read access to this dataset repository
 ## Architecture

download_pdfs.py ADDED Viewed

	@@ -0,0 +1,164 @@

+from datasets import load_dataset
+import os
+from huggingface_hub import hf_hub_download
+import logging
+# Configure logging at INFO level so the progress messages logged below are emitted
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by download_pdfs()
+logger = logging.getLogger(__name__)
+def download_pdfs():
+    """
+    Download PDF files from the Hugging Face dataset.
+    """
+    logger.info("Creating data directory if it doesn't exist")
+    os.makedirs("data", exist_ok=True)
+    try:
+        logger.info("Loading the dataset from kamkol/ab_testing_pdfs")
+        # Try to load the dataset first
+        dataset = load_dataset("kamkol/ab_testing_pdfs", use_auth_token=True)
+        # Check if we have files in the dataset
+        if 'train' in dataset and len(dataset['train']) > 0:
+            logger.info(f"Found {len(dataset['train'])} files in dataset")
+            # Handle dataset format that uses binary field
+            if 'binary' in dataset['train'].features:
+                for i, item in enumerate(dataset['train']):
+                    filename = item["filename"] if "filename" in item else f"document_{i}.pdf"
+                    with open(f"data/{filename}", "wb") as f:
+                        f.write(item["binary"])
+                    logger.info(f"Downloaded: {filename}")
+            # Alternative approach for direct file access
+            else:
+                # List all PDF files in the repository
+                logger.info("Dataset doesn't have binary field, trying direct file download")
+                for i, item in enumerate(dataset['train']):
+                    # Get filename from the dataset if available
+                    if 'filename' in item:
+                        filename = item['filename']
+                    else:
+                        logger.warning(f"No filename found for item {i}, using default")
+                        filename = f"document_{i}.pdf"
+                    # Download the file
+                    try:
+                        file_path = hf_hub_download(
+                            repo_id="kamkol/ab_testing_pdfs",
+                            filename=filename,
+                            repo_type="dataset",
+                            use_auth_token=True
+                        )
+                        # Copy to data directory
+                        with open(file_path, 'rb') as src, open(f"data/{filename}", 'wb') as dst:
+                            dst.write(src.read())
+                        logger.info(f"Downloaded: {filename}")
+                    except Exception as e:
+                        logger.error(f"Error downloading {filename}: {str(e)}")
+        else:
+            # Fall back to direct file download from the repository
+            logger.info("No files found in dataset train split, trying direct repository access")
+            # List of AB Testing PDF files - use the exact filenames from your data directory
+            pdf_files = [
+                "Shipping Flat Treatments in Online Controlled Experiments.pdf",
+                "Companies with really small traffic.pdf",
+                "Major Redesigns Usually Fail.pdf",
+                "Capping Metrics Linkedin Post.pdf",
+                "When to Use Bayesian vs Frequentist.pdf",
+                "Trustworthy AB Patterns.pdf",
+                "Why are Power Calculators Giving Different Results.pdf",
+                "Practical Defaults for AB Testing.pdf",
+                "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
+                "Statistical Challenges in Online Controlled Experiments  A Review of A B Testing Methodology.pdf",
+                "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
+                "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
+                "TriggeringRuleOfThumb.pdf",
+                "AB Testing Intuition Busters.pdf",
+                "The Surprising Power of Online Experiments.pdf",
+                "What Should the Primary Metric Be for Experimentation Platforms.pdf",
+                "Online Controlled Experiments at Large Scale.pdf",
+                "Online Controlled Experiments and AB Tests.pdf",
+                "Seven Rules of Thumb for Web Site Experimenters.pdf",
+                "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
+                "Pvalue Misinterpretations Annotated References.pdf",
+                "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
+                "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
+                "False Positives In AB Tests.pdf",
+                "emetrics Amazon.pdf",
+                "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
+                "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
+                "Controlled Experiments on the Web Survey and Practical Guide.pdf"
+            ]
+            for filename in pdf_files:
+                try:
+                    file_path = hf_hub_download(
+                        repo_id="kamkol/ab_testing_pdfs",
+                        filename=filename,
+                        repo_type="dataset",
+                        use_auth_token=True
+                    )
+                    # Copy to data directory
+                    with open(file_path, 'rb') as src, open(f"data/{filename}", 'wb') as dst:
+                        dst.write(src.read())
+                    logger.info(f"Downloaded: {filename}")
+                except Exception as e:
+                    logger.error(f"Error downloading {filename}: {str(e)}")
+    except Exception as e:
+        logger.error(f"Error loading dataset: {str(e)}")
+        logger.info("Falling back to direct file download")
+        # List of PDF files to download - same as above
+        pdf_files = [
+            "Shipping Flat Treatments in Online Controlled Experiments.pdf",
+            "Companies with really small traffic.pdf",
+            "Major Redesigns Usually Fail.pdf",
+            "Capping Metrics Linkedin Post.pdf",
+            "When to Use Bayesian vs Frequentist.pdf",
+            "Trustworthy AB Patterns.pdf",
+            "Why are Power Calculators Giving Different Results.pdf",
+            "Practical Defaults for AB Testing.pdf",
+            "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
+            "Statistical Challenges in Online Controlled Experiments  A Review of A B Testing Methodology.pdf",
+            "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
+            "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
+            "TriggeringRuleOfThumb.pdf",
+            "AB Testing Intuition Busters.pdf",
+            "The Surprising Power of Online Experiments.pdf",
+            "What Should the Primary Metric Be for Experimentation Platforms.pdf",
+            "Online Controlled Experiments at Large Scale.pdf",
+            "Online Controlled Experiments and AB Tests.pdf",
+            "Seven Rules of Thumb for Web Site Experimenters.pdf",
+            "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
+            "Pvalue Misinterpretations Annotated References.pdf",
+            "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
+            "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
+            "False Positives In AB Tests.pdf",
+            "emetrics Amazon.pdf",
+            "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
+            "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
+            "Controlled Experiments on the Web Survey and Practical Guide.pdf"
+        ]
+        for filename in pdf_files:
+            try:
+                file_path = hf_hub_download(
+                    repo_id="kamkol/ab_testing_pdfs",
+                    filename=filename,
+                    repo_type="dataset",
+                    use_auth_token=True
+                )
+                # Copy to data directory
+                with open(file_path, 'rb') as src, open(f"data/{filename}", 'wb') as dst:
+                    dst.write(src.read())
+                logger.info(f"Downloaded: {filename}")
+            except Exception as e:
+                logger.error(f"Error downloading {filename}: {str(e)}")
+# Run the download when executed as a script (e.g. `python download_pdfs.py`)
+if __name__ == "__main__":
+    download_pdfs()

huggingface-space.yml CHANGED Viewed

@@ -9,6 +9,7 @@ license: mit
 # Environment variables (fill these in on Hugging Face)
 # OPENAI_API_KEY: Your OpenAI API Key
 # Configuration for Dockerfile
 dockerfile:

 # Environment variables (fill these in on Hugging Face)
 # OPENAI_API_KEY: Your OpenAI API Key
+# HF_TOKEN: Your Hugging Face Token with access to the kamkol/ab_testing_pdfs dataset
 # Configuration for Dockerfile
 dockerfile:

requirements.txt CHANGED Viewed

@@ -11,4 +11,6 @@ python-dotenv==1.0.1
 unstructured==0.12.5
 pypdf==3.17.4
 numpy==1.26.3
-requests==2.31.0

 unstructured==0.12.5
 pypdf==3.17.4
 numpy==1.26.3
+requests==2.31.0
+huggingface_hub==0.20.3
+datasets==2.16.0

scripts/prepare_for_deployment.py CHANGED Viewed

@@ -82,8 +82,6 @@ def list_deployment_files():
         "requirements.txt",
         "Dockerfile",
         "docker-compose.yml",
-        "chainlit.md",
-        "chainlit.json",
         "README.md",
         "scripts/docker-entrypoint.sh",
         ".dockerignore",
@@ -119,7 +117,7 @@ To deploy to Hugging Face Spaces:
    git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
 5. Stage the necessary files (do NOT include PDF files):
-   git add app.py requirements.txt Dockerfile docker-compose.yml chainlit.md chainlit.json README.md scripts/docker-entrypoint.sh .dockerignore processed_data/
 6. Commit the changes:
    git commit -m "Prepare for Hugging Face deployment with pre-processed data"

         "requirements.txt",
         "Dockerfile",
         "docker-compose.yml",
         "README.md",
         "scripts/docker-entrypoint.sh",
         ".dockerignore",
    git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
 5. Stage the necessary files (do NOT include PDF files):
+   git add app.py requirements.txt Dockerfile docker-compose.yml README.md scripts/docker-entrypoint.sh .dockerignore processed_data/
 6. Commit the changes:
    git commit -m "Prepare for Hugging Face deployment with pre-processed data"