Spaces:
Sleeping
Sleeping
Better handling of large preprocessed data files on Hugging Face
Browse files- .dockerignore +50 -6
- .env.example +7 -5
- .gitattributes +39 -0
- .gitignore +2 -43
- Dockerfile +14 -2
- Makefile +2 -3
- README.md +23 -1
- download_pdfs.py +164 -0
- huggingface-space.yml +1 -0
- requirements.txt +3 -1
- scripts/prepare_for_deployment.py +1 -3
.dockerignore
CHANGED
|
@@ -44,11 +44,6 @@ notebook_version/
|
|
| 44 |
# Do NOT exclude data directory, as we need to process PDFs during docker build if needed
|
| 45 |
#/data
|
| 46 |
|
| 47 |
-
# Chainlit-specific
|
| 48 |
-
.chainlit
|
| 49 |
-
chainlit.md
|
| 50 |
-
chainlit.json
|
| 51 |
-
|
| 52 |
# Misc
|
| 53 |
.DS_Store
|
| 54 |
|
|
@@ -83,4 +78,53 @@ ENV/
|
|
| 83 |
data/*.pdf
|
| 84 |
|
| 85 |
# Pre-processed data (will be generated inside the container)
|
| 86 |
-
processed_data/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
# Do NOT exclude data directory, as we need to process PDFs during docker build if needed
|
| 45 |
#/data
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
# Misc
|
| 48 |
.DS_Store
|
| 49 |
|
|
|
|
| 78 |
data/*.pdf
|
| 79 |
|
| 80 |
# Pre-processed data (will be generated inside the container)
|
| 81 |
+
processed_data/
|
| 82 |
+
|
| 83 |
+
# Environment and secrets
|
| 84 |
+
.env
|
| 85 |
+
.venv
|
| 86 |
+
venv/
|
| 87 |
+
ENV/
|
| 88 |
+
env/
|
| 89 |
+
.env.local
|
| 90 |
+
.env.development.local
|
| 91 |
+
.env.test.local
|
| 92 |
+
.env.production.local
|
| 93 |
+
|
| 94 |
+
# IDE specific files
|
| 95 |
+
.idea/
|
| 96 |
+
.vscode/
|
| 97 |
+
*.swp
|
| 98 |
+
*.swo
|
| 99 |
+
|
| 100 |
+
# Mac OS
|
| 101 |
+
.DS_Store
|
| 102 |
+
|
| 103 |
+
# Git related
|
| 104 |
+
.git/
|
| 105 |
+
.gitignore
|
| 106 |
+
.github/
|
| 107 |
+
|
| 108 |
+
# Local data - exclude PDFs (processed_data/ is already excluded above and is regenerated inside the container)
|
| 109 |
+
/data/*.pdf
|
| 110 |
+
/data/raw/
|
| 111 |
+
|
| 112 |
+
# Debug and log files
|
| 113 |
+
*.log
|
| 114 |
+
debug_*.py
|
| 115 |
+
*_debug.py
|
| 116 |
+
debug_*.log
|
| 117 |
+
run_log.txt
|
| 118 |
+
|
| 119 |
+
# Test related
|
| 120 |
+
.coverage
|
| 121 |
+
.pytest_cache/
|
| 122 |
+
pytest_cache/
|
| 123 |
+
htmlcov/
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# Build artifacts
|
| 127 |
+
__pycache__/
|
| 128 |
+
*.py[cod]
|
| 129 |
+
*$py.class
|
| 130 |
+
*.so
|
.env.example
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
# OpenAI API key
|
| 2 |
-
OPENAI_API_KEY=
|
| 3 |
|
| 4 |
# Optional backup key if primary is not set
|
| 5 |
OPENAI_API_KEY_BACKUP=
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
| 1 |
# OpenAI API key
|
| 2 |
+
OPENAI_API_KEY=your_openai_api_key_here
|
| 3 |
|
| 4 |
# Optional backup key if primary is not set
|
| 5 |
OPENAI_API_KEY_BACKUP=
|
| 6 |
|
| 7 |
+
# Hugging Face Token - required for downloading PDFs from the Hugging Face dataset
|
| 8 |
+
HF_TOKEN=your_huggingface_token_here
|
| 9 |
+
|
| 10 |
+
# Streamlit configuration (optional)
|
| 11 |
+
# STREAMLIT_SERVER_PORT=8501
|
| 12 |
+
# STREAMLIT_SERVER_HEADLESS=true
|
.gitattributes
CHANGED
|
@@ -1 +1,40 @@
|
|
| 1 |
processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
|
| 3 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
|
| 39 |
+
# For SQLite database files in vector store
|
| 40 |
+
*.sqlite filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -31,10 +31,6 @@ env/
|
|
| 31 |
.env.test.local
|
| 32 |
.env.production.local
|
| 33 |
|
| 34 |
-
# Chainlit
|
| 35 |
-
.chainlit/
|
| 36 |
-
.chainlit/cache/
|
| 37 |
-
|
| 38 |
# IDE specific files
|
| 39 |
.idea/
|
| 40 |
.vscode/
|
|
@@ -44,32 +40,13 @@ env/
|
|
| 44 |
# Mac OS
|
| 45 |
.DS_Store
|
| 46 |
|
| 47 |
-
# Local data -
|
| 48 |
/data/*.pdf
|
| 49 |
-
/data/
|
| 50 |
|
| 51 |
# Do NOT ignore processed_data anymore - we want to commit this
|
| 52 |
# /processed_data/
|
| 53 |
|
| 54 |
-
# Byte-compiled / optimized / DLL files
|
| 55 |
-
__pycache__/
|
| 56 |
-
*.py[cod]
|
| 57 |
-
*$py.class
|
| 58 |
-
.DS_Store
|
| 59 |
-
|
| 60 |
-
# Distribution / packaging
|
| 61 |
-
dist/
|
| 62 |
-
build/
|
| 63 |
-
*.egg-info/
|
| 64 |
-
|
| 65 |
-
# Virtual environment
|
| 66 |
-
venv/
|
| 67 |
-
env/
|
| 68 |
-
.env
|
| 69 |
-
|
| 70 |
-
# Environment variables
|
| 71 |
-
.env.local
|
| 72 |
-
|
| 73 |
# Debug and log files
|
| 74 |
*.log
|
| 75 |
debug_*.py
|
|
@@ -77,24 +54,6 @@ debug_*.py
|
|
| 77 |
debug_*.log
|
| 78 |
run_log.txt
|
| 79 |
|
| 80 |
-
# Local data
|
| 81 |
-
/data/*.pdf
|
| 82 |
-
/data/raw/
|
| 83 |
-
|
| 84 |
-
# Ignore Chainlit-specific files (using Streamlit instead)
|
| 85 |
-
/.chainlit/
|
| 86 |
-
chainlit.md
|
| 87 |
-
chainlit.json
|
| 88 |
-
|
| 89 |
-
# Keep processed data
|
| 90 |
-
#/processed_data/
|
| 91 |
-
|
| 92 |
-
# IDE and editor files
|
| 93 |
-
.idea/
|
| 94 |
-
.vscode/
|
| 95 |
-
*.swp
|
| 96 |
-
*.swo
|
| 97 |
-
|
| 98 |
# Testing
|
| 99 |
pytest_cache/
|
| 100 |
.pytest_cache/
|
|
|
|
| 31 |
.env.test.local
|
| 32 |
.env.production.local
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
# IDE specific files
|
| 35 |
.idea/
|
| 36 |
.vscode/
|
|
|
|
| 40 |
# Mac OS
|
| 41 |
.DS_Store
|
| 42 |
|
| 43 |
+
# Local data - exclude PDFs but not directory structure
|
| 44 |
/data/*.pdf
|
| 45 |
+
/data/raw/
|
| 46 |
|
| 47 |
# Do NOT ignore processed_data anymore - we want to commit this
|
| 48 |
# /processed_data/
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# Debug and log files
|
| 51 |
*.log
|
| 52 |
debug_*.py
|
|
|
|
| 54 |
debug_*.log
|
| 55 |
run_log.txt
|
| 56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
# Testing
|
| 58 |
pytest_cache/
|
| 59 |
.pytest_cache/
|
Dockerfile
CHANGED
|
@@ -30,6 +30,9 @@ RUN mkdir -p /root/.streamlit && \
|
|
| 30 |
echo '[server]' >> /root/.streamlit/config.toml && \
|
| 31 |
echo 'headless = true' >> /root/.streamlit/config.toml
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
# Set environment variables
|
| 34 |
ENV HOST=0.0.0.0
|
| 35 |
ENV PORT=8000
|
|
@@ -40,9 +43,18 @@ EXPOSE $PORT
|
|
| 40 |
# Create the entrypoint script
|
| 41 |
RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
|
| 42 |
echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
|
|
|
|
| 43 |
echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
|
| 44 |
-
echo ' echo "
|
| 45 |
-
echo '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
echo 'else' >> /app/entrypoint.sh && \
|
| 47 |
echo ' echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
|
| 48 |
echo 'fi' >> /app/entrypoint.sh && \
|
|
|
|
| 30 |
echo '[server]' >> /root/.streamlit/config.toml && \
|
| 31 |
echo 'headless = true' >> /root/.streamlit/config.toml
|
| 32 |
|
| 33 |
+
# Install additional required packages
|
| 34 |
+
RUN pip install --no-cache-dir huggingface_hub datasets
|
| 35 |
+
|
| 36 |
# Set environment variables
|
| 37 |
ENV HOST=0.0.0.0
|
| 38 |
ENV PORT=8000
|
|
|
|
| 43 |
# Create the entrypoint script
|
| 44 |
RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
|
| 45 |
echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
|
| 46 |
+
echo 'echo "Checking for pre-processed data..."' >> /app/entrypoint.sh && \
|
| 47 |
echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
|
| 48 |
+
echo ' echo "Pre-processed data not found. Downloading PDFs..."' >> /app/entrypoint.sh && \
|
| 49 |
+
echo ' if [ -n "${HF_TOKEN}" ]; then' >> /app/entrypoint.sh && \
|
| 50 |
+
echo ' python download_pdfs.py' >> /app/entrypoint.sh && \
|
| 51 |
+
echo ' echo "Running preprocessing..."' >> /app/entrypoint.sh && \
|
| 52 |
+
echo ' python scripts/preprocess_data.py' >> /app/entrypoint.sh && \
|
| 53 |
+
echo ' else' >> /app/entrypoint.sh && \
|
| 54 |
+
echo ' echo "Error: HF_TOKEN environment variable is not set. Cannot download PDFs."' >> /app/entrypoint.sh && \
|
| 55 |
+
echo ' echo "Please set the HF_TOKEN environment variable in your Hugging Face Space settings."' >> /app/entrypoint.sh && \
|
| 56 |
+
echo ' exit 1' >> /app/entrypoint.sh && \
|
| 57 |
+
echo ' fi' >> /app/entrypoint.sh && \
|
| 58 |
echo 'else' >> /app/entrypoint.sh && \
|
| 59 |
echo ' echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
|
| 60 |
echo 'fi' >> /app/entrypoint.sh && \
|
Makefile
CHANGED
|
@@ -4,7 +4,7 @@ setup:
|
|
| 4 |
python -m pip install -r requirements.txt
|
| 5 |
|
| 6 |
run:
|
| 7 |
-
|
| 8 |
|
| 9 |
docker-build:
|
| 10 |
docker build -t ab-testing-rag-agent .
|
|
@@ -19,5 +19,4 @@ docker-compose-down:
|
|
| 19 |
docker-compose down
|
| 20 |
|
| 21 |
clean:
|
| 22 |
-
rm -rf __pycache__
|
| 23 |
-
rm -rf .chainlit
|
|
|
|
| 4 |
python -m pip install -r requirements.txt
|
| 5 |
|
| 6 |
run:
|
| 7 |
+
streamlit run streamlit_app.py
|
| 8 |
|
| 9 |
docker-build:
|
| 10 |
docker build -t ab-testing-rag-agent .
|
|
|
|
| 19 |
docker-compose down
|
| 20 |
|
| 21 |
clean:
|
| 22 |
+
rm -rf __pycache__
|
|
|
README.md
CHANGED
|
@@ -79,7 +79,29 @@ git remote add hf https://huggingface.co/spaces/yourusername/ab-testing-rag
|
|
| 79 |
git push hf main
|
| 80 |
```
|
| 81 |
|
| 82 |
-
3. Set
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
## Architecture
|
| 85 |
|
|
|
|
| 79 |
git push hf main
|
| 80 |
```
|
| 81 |
|
| 82 |
+
3. Set both required environment variables in the Hugging Face Space settings:
|
| 83 |
+
- `OPENAI_API_KEY`: Your OpenAI API key
|
| 84 |
+
- `HF_TOKEN`: Your Hugging Face token with access to the dataset
|
| 85 |
+
|
| 86 |
+
### Setting Up The PDF Dataset on Hugging Face
|
| 87 |
+
|
| 88 |
+
The deployment uses PDFs stored in a separate Hugging Face dataset repo. To set up your own:
|
| 89 |
+
|
| 90 |
+
1. Create a dataset repository on Hugging Face called `yourusername/ab_testing_pdfs`
|
| 91 |
+
|
| 92 |
+
2. Upload all your PDF files to this repository via the Hugging Face UI or git:
|
| 93 |
+
```bash
|
| 94 |
+
git clone https://huggingface.co/datasets/yourusername/ab_testing_pdfs
|
| 95 |
+
cd ab_testing_pdfs
|
| 96 |
+
cp /path/to/your/pdfs/*.pdf .
|
| 97 |
+
git add .
|
| 98 |
+
git commit -m "Add AB Testing PDFs"
|
| 99 |
+
git push
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
3. Update the dataset name in `download_pdfs.py` if you used a different repository name
|
| 103 |
+
|
| 104 |
+
4. Make sure your `HF_TOKEN` has read access to this dataset repository
|
| 105 |
|
| 106 |
## Architecture
|
| 107 |
|
download_pdfs.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from datasets import load_dataset
|
| 2 |
+
import os
|
| 3 |
+
from huggingface_hub import hf_hub_download
|
| 4 |
+
import logging
|
| 5 |
+
|
| 6 |
+
# Configure logging
|
| 7 |
+
logging.basicConfig(level=logging.INFO)
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
# Hugging Face dataset repository that holds the source PDFs.
PDF_DATASET_REPO = "kamkol/ab_testing_pdfs"

# Local directory the PDFs are written to.
PDF_DATA_DIR = "data"

# Exact filenames in the dataset repository, used when the dataset cannot be
# loaded (or has no usable "train" split) and files must be fetched one by one.
KNOWN_PDF_FILES = (
    "Shipping Flat Treatments in Online Controlled Experiments.pdf",
    "Companies with really small traffic.pdf",
    "Major Redesigns Usually Fail.pdf",
    "Capping Metrics Linkedin Post.pdf",
    "When to Use Bayesian vs Frequentist.pdf",
    "Trustworthy AB Patterns.pdf",
    "Why are Power Calculators Giving Different Results.pdf",
    "Practical Defaults for AB Testing.pdf",
    "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
    "Statistical Challenges in Online Controlled Experiments A Review of A B Testing Methodology.pdf",
    "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
    "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
    "TriggeringRuleOfThumb.pdf",
    "AB Testing Intuition Busters.pdf",
    "The Surprising Power of Online Experiments.pdf",
    "What Should the Primary Metric Be for Experimentation Platforms.pdf",
    "Online Controlled Experiments at Large Scale.pdf",
    "Online Controlled Experiments and AB Tests.pdf",
    "Seven Rules of Thumb for Web Site Experimenters.pdf",
    "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
    "Pvalue Misinterpretations Annotated References.pdf",
    "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
    "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
    "False Positives In AB Tests.pdf",
    "emetrics Amazon.pdf",
    "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
    "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
    "Controlled Experiments on the Web Survey and Practical Guide.pdf",
)


def _hf_token():
    """Return the token to authenticate with.

    Prefers the HF_TOKEN environment variable; when it is unset or empty,
    returns True so the Hugging Face libraries fall back to locally cached
    credentials (the behaviour the old ``use_auth_token=True`` requested).
    """
    return os.environ.get("HF_TOKEN") or True


def _local_pdf_path(filename):
    """Return the destination path for *filename* inside the data directory.

    Only the final path component is kept so that a dataset-supplied name
    cannot point outside ``PDF_DATA_DIR``.
    """
    return os.path.join(PDF_DATA_DIR, os.path.basename(filename))


def _download_repo_file(filename, token):
    """Fetch one file from the dataset repository into the data directory.

    Returns True on success. Any failure is logged and reported as False so
    that one bad file does not abort the remaining downloads.
    """
    import shutil  # local import: keeps this block self-contained

    try:
        cached_path = hf_hub_download(
            repo_id=PDF_DATASET_REPO,
            filename=filename,
            repo_type="dataset",
            token=token,
        )
        # Copy out of the hub cache without loading the whole PDF into memory.
        shutil.copyfile(cached_path, _local_pdf_path(filename))
    except Exception as exc:  # best-effort per file
        logger.error(f"Error downloading {filename}: {str(exc)}")
        return False

    logger.info(f"Downloaded: {filename}")
    return True


def _download_known_files(token):
    """Download every entry of KNOWN_PDF_FILES; return how many succeeded."""
    return sum(_download_repo_file(name, token) for name in KNOWN_PDF_FILES)


def _download_from_dataset(token):
    """Download PDFs by way of the dataset's "train" split.

    Returns the number of files written, or None when the split is missing
    or empty and the caller should fall back to the known file list.
    Exceptions from loading the dataset or from writing embedded binary
    content propagate to the caller.
    """
    logger.info("Loading the dataset from kamkol/ab_testing_pdfs")
    dataset = load_dataset(PDF_DATASET_REPO, token=token)

    if "train" not in dataset or len(dataset["train"]) == 0:
        logger.info("No files found in dataset train split, trying direct repository access")
        return None

    train = dataset["train"]
    logger.info(f"Found {len(train)} files in dataset")
    written = 0

    if "binary" in train.features:
        # Rows carry the PDF bytes themselves.
        for i, item in enumerate(train):
            filename = item["filename"] if "filename" in item else f"document_{i}.pdf"
            with open(_local_pdf_path(filename), "wb") as f:
                f.write(item["binary"])
            logger.info(f"Downloaded: {filename}")
            written += 1
        return written

    # Rows only name the files; fetch each one from the repository.
    logger.info("Dataset doesn't have binary field, trying direct file download")
    for i, item in enumerate(train):
        if "filename" in item:
            filename = item["filename"]
        else:
            logger.warning(f"No filename found for item {i}, using default")
            filename = f"document_{i}.pdf"
        written += _download_repo_file(filename, token)
    return written


def download_pdfs():
    """
    Download PDF files from the Hugging Face dataset.

    Creates the local ``data`` directory, then tries to obtain the PDFs
    through the dataset's "train" split (either embedded binary content or
    per-row filenames). If the dataset cannot be loaded, or has no usable
    "train" split, each file in KNOWN_PDF_FILES is fetched directly from the
    repository instead. Individual download failures are logged and skipped.

    Returns:
        int: number of PDF files written by the path that completed.
    """
    logger.info("Creating data directory if it doesn't exist")
    os.makedirs(PDF_DATA_DIR, exist_ok=True)

    token = _hf_token()

    try:
        downloaded = _download_from_dataset(token)
    except Exception as exc:
        logger.error(f"Error loading dataset: {str(exc)}")
        logger.info("Falling back to direct file download")
        downloaded = None

    if downloaded is None:
        downloaded = _download_known_files(token)

    if downloaded == 0:
        # Make the "nothing to preprocess" case visible in the logs.
        logger.warning("No PDF files were downloaded")

    return downloaded


if __name__ == "__main__":
    download_pdfs()
|
huggingface-space.yml
CHANGED
|
@@ -9,6 +9,7 @@ license: mit
|
|
| 9 |
|
| 10 |
# Environment variables (fill these in on Hugging Face)
|
| 11 |
# OPENAI_API_KEY: Your OpenAI API Key
|
|
|
|
| 12 |
|
| 13 |
# Configuration for Dockerfile
|
| 14 |
dockerfile:
|
|
|
|
| 9 |
|
| 10 |
# Environment variables (fill these in on Hugging Face)
|
| 11 |
# OPENAI_API_KEY: Your OpenAI API Key
|
| 12 |
+
# HF_TOKEN: Your Hugging Face Token with access to the ab_testing_pdfs dataset
|
| 13 |
|
| 14 |
# Configuration for Dockerfile
|
| 15 |
dockerfile:
|
requirements.txt
CHANGED
|
@@ -11,4 +11,6 @@ python-dotenv==1.0.1
|
|
| 11 |
unstructured==0.12.5
|
| 12 |
pypdf==3.17.4
|
| 13 |
numpy==1.26.3
|
| 14 |
-
requests==2.31.0
|
|
|
|
|
|
|
|
|
| 11 |
unstructured==0.12.5
|
| 12 |
pypdf==3.17.4
|
| 13 |
numpy==1.26.3
|
| 14 |
+
requests==2.31.0
|
| 15 |
+
huggingface_hub==0.20.3
|
| 16 |
+
datasets==2.16.0
|
scripts/prepare_for_deployment.py
CHANGED
|
@@ -82,8 +82,6 @@ def list_deployment_files():
|
|
| 82 |
"requirements.txt",
|
| 83 |
"Dockerfile",
|
| 84 |
"docker-compose.yml",
|
| 85 |
-
"chainlit.md",
|
| 86 |
-
"chainlit.json",
|
| 87 |
"README.md",
|
| 88 |
"scripts/docker-entrypoint.sh",
|
| 89 |
".dockerignore",
|
|
@@ -119,7 +117,7 @@ To deploy to Hugging Face Spaces:
|
|
| 119 |
git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 120 |
|
| 121 |
5. Stage the necessary files (do NOT include PDF files):
|
| 122 |
-
git add app.py requirements.txt Dockerfile docker-compose.yml
|
| 123 |
|
| 124 |
6. Commit the changes:
|
| 125 |
git commit -m "Prepare for Hugging Face deployment with pre-processed data"
|
|
|
|
| 82 |
"requirements.txt",
|
| 83 |
"Dockerfile",
|
| 84 |
"docker-compose.yml",
|
|
|
|
|
|
|
| 85 |
"README.md",
|
| 86 |
"scripts/docker-entrypoint.sh",
|
| 87 |
".dockerignore",
|
|
|
|
| 117 |
git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 118 |
|
| 119 |
5. Stage the necessary files (do NOT include PDF files):
|
| 120 |
+
git add app.py requirements.txt Dockerfile docker-compose.yml README.md scripts/docker-entrypoint.sh .dockerignore processed_data/
|
| 121 |
|
| 122 |
6. Commit the changes:
|
| 123 |
git commit -m "Prepare for Hugging Face deployment with pre-processed data"
|