kamkol committed on
Commit
2585f8a
·
1 Parent(s): 52f5b9a

Better handling of large preprocessed data files on Hugging Face

Browse files
.dockerignore CHANGED
@@ -44,11 +44,6 @@ notebook_version/
44
  # Do NOT exclude data directory, as we need to process PDFs during docker build if needed
45
  #/data
46
 
47
- # Chainlit-specific
48
- .chainlit
49
- chainlit.md
50
- chainlit.json
51
-
52
  # Misc
53
  .DS_Store
54
 
@@ -83,4 +78,53 @@ ENV/
83
  data/*.pdf
84
 
85
  # Pre-processed data (will be generated inside the container)
86
- processed_data/
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  # Do NOT exclude data directory, as we need to process PDFs during docker build if needed
45
  #/data
46
 
 
 
 
 
 
47
  # Misc
48
  .DS_Store
49
 
 
78
  data/*.pdf
79
 
80
  # Pre-processed data (will be generated inside the container)
81
+ processed_data/
82
+
83
+ # Environment and secrets
84
+ .env
85
+ .venv
86
+ venv/
87
+ ENV/
88
+ env/
89
+ .env.local
90
+ .env.development.local
91
+ .env.test.local
92
+ .env.production.local
93
+
94
+ # IDE specific files
95
+ .idea/
96
+ .vscode/
97
+ *.swp
98
+ *.swo
99
+
100
+ # Mac OS
101
+ .DS_Store
102
+
103
+ # Git related
104
+ .git/
105
+ .gitignore
106
+ .github/
107
+
108
+ # Local data - exclude source PDFs and raw inputs (processed_data/ is also excluded above)
109
+ /data/*.pdf
110
+ /data/raw/
111
+
112
+ # Debug and log files
113
+ *.log
114
+ debug_*.py
115
+ *_debug.py
116
+ debug_*.log
117
+ run_log.txt
118
+
119
+ # Test related
120
+ .coverage
121
+ .pytest_cache/
122
+ pytest_cache/
123
+ htmlcov/
124
+
125
+
126
+ # Build artifacts
127
+ __pycache__/
128
+ *.py[cod]
129
+ *$py.class
130
+ *.so
.env.example CHANGED
@@ -1,10 +1,12 @@
1
  # OpenAI API key
2
- OPENAI_API_KEY=your_openai_api_key
3
 
4
  # Optional backup key if primary is not set
5
  OPENAI_API_KEY_BACKUP=
6
 
7
- # Chainlit configuration
8
- CHAINLIT_MAX_STEPS_HISTORY=10
9
- CHAINLIT_AUTH_SECRET=
10
- CHAINLIT_SERVER_URL=
 
 
 
1
  # OpenAI API key
2
+ OPENAI_API_KEY=your_openai_api_key_here
3
 
4
  # Optional backup key if primary is not set
5
  OPENAI_API_KEY_BACKUP=
6
 
7
+ # Hugging Face Token - required for downloading PDFs from the Hugging Face dataset
8
+ HF_TOKEN=your_huggingface_token_here
9
+
10
+ # Streamlit configuration (optional)
11
+ # STREAMLIT_SERVER_PORT=8501
12
+ # STREAMLIT_SERVER_HEADLESS=true
.gitattributes CHANGED
@@ -1 +1,40 @@
1
  processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  processed_data/qdrant_vectorstore/collection/kohavi_ab_testing_pdf_collection/storage.sqlite filter=lfs diff=lfs merge=lfs -text
2
+
3
+ *.7z filter=lfs diff=lfs merge=lfs -text
4
+ *.arrow filter=lfs diff=lfs merge=lfs -text
5
+ *.bin filter=lfs diff=lfs merge=lfs -text
6
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
7
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
8
+ *.ftz filter=lfs diff=lfs merge=lfs -text
9
+ *.gz filter=lfs diff=lfs merge=lfs -text
10
+ *.h5 filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
13
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
14
+ *.model filter=lfs diff=lfs merge=lfs -text
15
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
16
+ *.npy filter=lfs diff=lfs merge=lfs -text
17
+ *.npz filter=lfs diff=lfs merge=lfs -text
18
+ *.onnx filter=lfs diff=lfs merge=lfs -text
19
+ *.ot filter=lfs diff=lfs merge=lfs -text
20
+ *.parquet filter=lfs diff=lfs merge=lfs -text
21
+ *.pb filter=lfs diff=lfs merge=lfs -text
22
+ *.pickle filter=lfs diff=lfs merge=lfs -text
23
+ *.pkl filter=lfs diff=lfs merge=lfs -text
24
+ *.pt filter=lfs diff=lfs merge=lfs -text
25
+ *.pth filter=lfs diff=lfs merge=lfs -text
26
+ *.rar filter=lfs diff=lfs merge=lfs -text
27
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
28
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
29
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
30
+ *.tar filter=lfs diff=lfs merge=lfs -text
31
+ *.tflite filter=lfs diff=lfs merge=lfs -text
32
+ *.tgz filter=lfs diff=lfs merge=lfs -text
33
+ *.wasm filter=lfs diff=lfs merge=lfs -text
34
+ *.xz filter=lfs diff=lfs merge=lfs -text
35
+ *.zip filter=lfs diff=lfs merge=lfs -text
36
+ *.zst filter=lfs diff=lfs merge=lfs -text
37
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
38
+
39
+ # For SQLite database files in vector store
40
+ *.sqlite filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -31,10 +31,6 @@ env/
31
  .env.test.local
32
  .env.production.local
33
 
34
- # Chainlit
35
- .chainlit/
36
- .chainlit/cache/
37
-
38
  # IDE specific files
39
  .idea/
40
  .vscode/
@@ -44,32 +40,13 @@ env/
44
  # Mac OS
45
  .DS_Store
46
 
47
- # Local data - keep PDFs private and never commit them
48
  /data/*.pdf
49
- /data/**/*.pdf
50
 
51
  # Do NOT ignore processed_data anymore - we want to commit this
52
  # /processed_data/
53
 
54
- # Byte-compiled / optimized / DLL files
55
- __pycache__/
56
- *.py[cod]
57
- *$py.class
58
- .DS_Store
59
-
60
- # Distribution / packaging
61
- dist/
62
- build/
63
- *.egg-info/
64
-
65
- # Virtual environment
66
- venv/
67
- env/
68
- .env
69
-
70
- # Environment variables
71
- .env.local
72
-
73
  # Debug and log files
74
  *.log
75
  debug_*.py
@@ -77,24 +54,6 @@ debug_*.py
77
  debug_*.log
78
  run_log.txt
79
 
80
- # Local data
81
- /data/*.pdf
82
- /data/raw/
83
-
84
- # Ignore Chainlit-specific files (using Streamlit instead)
85
- /.chainlit/
86
- chainlit.md
87
- chainlit.json
88
-
89
- # Keep processed data
90
- #/processed_data/
91
-
92
- # IDE and editor files
93
- .idea/
94
- .vscode/
95
- *.swp
96
- *.swo
97
-
98
  # Testing
99
  pytest_cache/
100
  .pytest_cache/
 
31
  .env.test.local
32
  .env.production.local
33
 
 
 
 
 
34
  # IDE specific files
35
  .idea/
36
  .vscode/
 
40
  # Mac OS
41
  .DS_Store
42
 
43
+ # Local data - exclude PDFs but not directory structure
44
  /data/*.pdf
45
+ /data/raw/
46
 
47
  # Do NOT ignore processed_data anymore - we want to commit this
48
  # /processed_data/
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # Debug and log files
51
  *.log
52
  debug_*.py
 
54
  debug_*.log
55
  run_log.txt
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  # Testing
58
  pytest_cache/
59
  .pytest_cache/
Dockerfile CHANGED
@@ -30,6 +30,9 @@ RUN mkdir -p /root/.streamlit && \
30
  echo '[server]' >> /root/.streamlit/config.toml && \
31
  echo 'headless = true' >> /root/.streamlit/config.toml
32
 
 
 
 
33
  # Set environment variables
34
  ENV HOST=0.0.0.0
35
  ENV PORT=8000
@@ -40,9 +43,18 @@ EXPOSE $PORT
40
  # Create the entrypoint script
41
  RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
42
  echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
 
43
  echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
44
- echo ' echo "No pre-processed data found. Running preprocessing..."' >> /app/entrypoint.sh && \
45
- echo ' python scripts/preprocess_data.py' >> /app/entrypoint.sh && \
 
 
 
 
 
 
 
 
46
  echo 'else' >> /app/entrypoint.sh && \
47
  echo ' echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
48
  echo 'fi' >> /app/entrypoint.sh && \
 
30
  echo '[server]' >> /root/.streamlit/config.toml && \
31
  echo 'headless = true' >> /root/.streamlit/config.toml
32
 
33
+ # Install additional required packages
34
+ RUN pip install --no-cache-dir huggingface_hub datasets
35
+
36
  # Set environment variables
37
  ENV HOST=0.0.0.0
38
  ENV PORT=8000
 
43
  # Create the entrypoint script
44
  RUN echo '#!/bin/bash' > /app/entrypoint.sh && \
45
  echo 'echo "Starting AB Testing RAG Agent"' >> /app/entrypoint.sh && \
46
+ echo 'echo "Checking for pre-processed data..."' >> /app/entrypoint.sh && \
47
  echo 'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' >> /app/entrypoint.sh && \
48
+ echo ' echo "Pre-processed data not found. Downloading PDFs..."' >> /app/entrypoint.sh && \
49
+ echo ' if [ -n "${HF_TOKEN}" ]; then' >> /app/entrypoint.sh && \
50
+ echo ' python download_pdfs.py' >> /app/entrypoint.sh && \
51
+ echo ' echo "Running preprocessing..."' >> /app/entrypoint.sh && \
52
+ echo ' python scripts/preprocess_data.py' >> /app/entrypoint.sh && \
53
+ echo ' else' >> /app/entrypoint.sh && \
54
+ echo ' echo "Error: HF_TOKEN environment variable is not set. Cannot download PDFs."' >> /app/entrypoint.sh && \
55
+ echo ' echo "Please set the HF_TOKEN environment variable in your Hugging Face Space settings."' >> /app/entrypoint.sh && \
56
+ echo ' exit 1' >> /app/entrypoint.sh && \
57
+ echo ' fi' >> /app/entrypoint.sh && \
58
  echo 'else' >> /app/entrypoint.sh && \
59
  echo ' echo "Using existing pre-processed data"' >> /app/entrypoint.sh && \
60
  echo 'fi' >> /app/entrypoint.sh && \
Makefile CHANGED
@@ -4,7 +4,7 @@ setup:
4
  python -m pip install -r requirements.txt
5
 
6
  run:
7
- chainlit run app.py
8
 
9
  docker-build:
10
  docker build -t ab-testing-rag-agent .
@@ -19,5 +19,4 @@ docker-compose-down:
19
  docker-compose down
20
 
21
  clean:
22
- rm -rf __pycache__
23
- rm -rf .chainlit
 
4
  python -m pip install -r requirements.txt
5
 
6
  run:
7
+ streamlit run streamlit_app.py
8
 
9
  docker-build:
10
  docker build -t ab-testing-rag-agent .
 
19
  docker-compose down
20
 
21
  clean:
22
+ rm -rf __pycache__
 
README.md CHANGED
@@ -79,7 +79,29 @@ git remote add hf https://huggingface.co/spaces/yourusername/ab-testing-rag
79
  git push hf main
80
  ```
81
 
82
- 3. Set your OPENAI_API_KEY in the Hugging Face Space settings.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  ## Architecture
85
 
 
79
  git push hf main
80
  ```
81
 
82
+ 3. Set both required environment variables in the Hugging Face Space settings:
83
+ - `OPENAI_API_KEY`: Your OpenAI API key
84
+ - `HF_TOKEN`: Your Hugging Face token with access to the dataset
85
+
86
+ ### Setting Up The PDF Dataset on Hugging Face
87
+
88
+ The deployment uses PDFs stored in a separate Hugging Face dataset repo. To set up your own:
89
+
90
+ 1. Create a dataset repository on Hugging Face called `yourusername/ab_testing_pdfs`
91
+
92
+ 2. Upload all your PDF files to this repository via the Hugging Face UI or git:
93
+ ```bash
94
+ git clone https://huggingface.co/datasets/yourusername/ab_testing_pdfs
95
+ cd ab_testing_pdfs
96
+ cp /path/to/your/pdfs/*.pdf .
97
+ git add .
98
+ git commit -m "Add AB Testing PDFs"
99
+ git push
100
+ ```
101
+
102
+ 3. Update the dataset name in `download_pdfs.py` if you used a different repository name
103
+
104
+ 4. Make sure your `HF_TOKEN` has read access to this dataset repository
105
 
106
  ## Architecture
107
 
download_pdfs.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Download the A/B-testing PDF corpus from a Hugging Face dataset repository.

The script first tries to read the repository through ``datasets.load_dataset``.
If that fails, or the ``train`` split is missing/empty, it falls back to
fetching a fixed list of known PDF filenames directly from the repository.
"""
import logging
import os
import shutil

from datasets import load_dataset
from huggingface_hub import hf_hub_download

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Dataset repository and local target directory used when no override is given.
DEFAULT_REPO_ID = "kamkol/ab_testing_pdfs"
DEFAULT_DATA_DIR = "data"

# Exact filenames expected in the dataset repository. Used as the fallback
# download list when the dataset cannot be loaded or has no usable train split.
KNOWN_PDF_FILES = (
    "Shipping Flat Treatments in Online Controlled Experiments.pdf",
    "Companies with really small traffic.pdf",
    "Major Redesigns Usually Fail.pdf",
    "Capping Metrics Linkedin Post.pdf",
    "When to Use Bayesian vs Frequentist.pdf",
    "Trustworthy AB Patterns.pdf",
    "Why are Power Calculators Giving Different Results.pdf",
    "Practical Defaults for AB Testing.pdf",
    "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
    "Statistical Challenges in Online Controlled Experiments A Review of A B Testing Methodology.pdf",
    "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
    "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
    "TriggeringRuleOfThumb.pdf",
    "AB Testing Intuition Busters.pdf",
    "The Surprising Power of Online Experiments.pdf",
    "What Should the Primary Metric Be for Experimentation Platforms.pdf",
    "Online Controlled Experiments at Large Scale.pdf",
    "Online Controlled Experiments and AB Tests.pdf",
    "Seven Rules of Thumb for Web Site Experimenters.pdf",
    "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
    "Pvalue Misinterpretations Annotated References.pdf",
    "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
    "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
    "False Positives In AB Tests.pdf",
    "emetrics Amazon.pdf",
    "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
    "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
    "Controlled Experiments on the Web Survey and Practical Guide.pdf",
)


def _auth_token():
    """Return the token to authenticate with: HF_TOKEN if set, else True (use the locally stored login)."""
    return os.environ.get("HF_TOKEN") or True


def _local_path(data_dir, filename):
    """Map a repository/dataset filename to a path directly inside ``data_dir``.

    Only the final path component is kept so that a name such as
    ``../../etc/passwd`` or ``sub/dir/a.pdf`` can neither escape ``data_dir``
    nor require a sub-directory that does not exist.
    """
    base = os.path.basename(str(filename).replace("\\", "/"))
    if not base or base in (".", ".."):
        raise ValueError(f"Unusable filename: {filename!r}")
    return os.path.join(data_dir, base)


def _fetch_from_repo(repo_id, filename, data_dir, token):
    """Download one file from the dataset repo and copy it into ``data_dir``; return True on success."""
    try:
        cached_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            repo_type="dataset",
            token=token,
        )
        # Copy out of the hub cache without loading the whole PDF into memory.
        shutil.copyfile(cached_path, _local_path(data_dir, filename))
    except Exception as e:
        # Best effort: one missing or unreadable file must not stop the others.
        logger.error(f"Error downloading {filename}: {str(e)}")
        return False
    logger.info(f"Downloaded: {filename}")
    return True


def _write_binary(filename, payload, data_dir):
    """Write an in-dataset binary payload to ``data_dir``; return True on success."""
    try:
        with open(_local_path(data_dir, filename), "wb") as f:
            f.write(payload)
    except Exception as e:
        # A bad row (e.g. payload is None) is logged and skipped, not fatal.
        logger.error(f"Error downloading {filename}: {str(e)}")
        return False
    logger.info(f"Downloaded: {filename}")
    return True


def _download_known_files(repo_id, data_dir, token):
    """Fetch every entry of ``KNOWN_PDF_FILES`` from the repo; return the number that succeeded."""
    return sum(
        _fetch_from_repo(repo_id, filename, data_dir, token)
        for filename in KNOWN_PDF_FILES
    )


def _download_via_dataset(repo_id, data_dir, token):
    """Download files described by the dataset's ``train`` split.

    Returns the number of files written, or ``None`` when the split is missing
    or empty and the caller should fall back to the known-file list.
    """
    logger.info(f"Loading the dataset from {repo_id}")
    dataset = load_dataset(repo_id, token=token)

    if "train" not in dataset or len(dataset["train"]) == 0:
        logger.info("No files found in dataset train split, trying direct repository access")
        return None

    rows = dataset["train"]
    logger.info(f"Found {len(rows)} files in dataset")

    # Rows either embed the PDF bytes in a "binary" column or only name a file
    # that has to be fetched from the repository separately.
    has_binary = "binary" in rows.features
    if not has_binary:
        logger.info("Dataset doesn't have binary field, trying direct file download")

    downloaded = 0
    for i, item in enumerate(rows):
        filename = item.get("filename")
        if not filename:
            logger.warning(f"No filename found for item {i}, using default")
            filename = f"document_{i}.pdf"

        if has_binary:
            downloaded += _write_binary(filename, item["binary"], data_dir)
        else:
            downloaded += _fetch_from_repo(repo_id, filename, data_dir, token)
    return downloaded


def download_pdfs(repo_id=DEFAULT_REPO_ID, data_dir=DEFAULT_DATA_DIR):
    """
    Download PDF files from the Hugging Face dataset.

    Args:
        repo_id: Dataset repository to read from (defaults to the project's repo).
        data_dir: Local directory the PDFs are written to; created if missing.

    Individual file failures are logged and skipped. If loading the dataset
    raises, or its train split is missing/empty, the fixed list in
    ``KNOWN_PDF_FILES`` is downloaded directly from the repository instead.
    """
    logger.info("Creating data directory if it doesn't exist")
    os.makedirs(data_dir, exist_ok=True)
    token = _auth_token()

    try:
        downloaded = _download_via_dataset(repo_id, data_dir, token)
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        logger.info("Falling back to direct file download")
        downloaded = None

    if downloaded is None:
        downloaded = _download_known_files(repo_id, data_dir, token)

    # Make a total failure visible: per-file errors alone are easy to miss in
    # container logs, and the next pipeline step would run on an empty directory.
    if downloaded == 0:
        logger.warning(f"No PDF files were downloaded into {data_dir}")
    else:
        logger.info(f"Finished: {downloaded} PDF file(s) available in {data_dir}")


if __name__ == "__main__":
    download_pdfs()
huggingface-space.yml CHANGED
@@ -9,6 +9,7 @@ license: mit
9
 
10
  # Environment variables (fill these in on Hugging Face)
11
  # OPENAI_API_KEY: Your OpenAI API Key
 
12
 
13
  # Configuration for Dockerfile
14
  dockerfile:
 
9
 
10
  # Environment variables (fill these in on Hugging Face)
11
  # OPENAI_API_KEY: Your OpenAI API Key
12
+ # HF_TOKEN: Your Hugging Face token with read access to the kamkol/ab_testing_pdfs dataset
13
 
14
  # Configuration for Dockerfile
15
  dockerfile:
requirements.txt CHANGED
@@ -11,4 +11,6 @@ python-dotenv==1.0.1
11
  unstructured==0.12.5
12
  pypdf==3.17.4
13
  numpy==1.26.3
14
- requests==2.31.0
 
 
 
11
  unstructured==0.12.5
12
  pypdf==3.17.4
13
  numpy==1.26.3
14
+ requests==2.31.0
15
+ huggingface_hub==0.20.3
16
+ datasets==2.16.0
scripts/prepare_for_deployment.py CHANGED
@@ -82,8 +82,6 @@ def list_deployment_files():
82
  "requirements.txt",
83
  "Dockerfile",
84
  "docker-compose.yml",
85
- "chainlit.md",
86
- "chainlit.json",
87
  "README.md",
88
  "scripts/docker-entrypoint.sh",
89
  ".dockerignore",
@@ -119,7 +117,7 @@ To deploy to Hugging Face Spaces:
119
  git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
120
 
121
  5. Stage the necessary files (do NOT include PDF files):
122
- git add app.py requirements.txt Dockerfile docker-compose.yml chainlit.md chainlit.json README.md scripts/docker-entrypoint.sh .dockerignore processed_data/
123
 
124
  6. Commit the changes:
125
  git commit -m "Prepare for Hugging Face deployment with pre-processed data"
 
82
  "requirements.txt",
83
  "Dockerfile",
84
  "docker-compose.yml",
 
 
85
  "README.md",
86
  "scripts/docker-entrypoint.sh",
87
  ".dockerignore",
 
117
  git remote add huggingface https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
118
 
119
  5. Stage the necessary files (do NOT include PDF files):
120
+ git add app.py requirements.txt Dockerfile docker-compose.yml README.md scripts/docker-entrypoint.sh .dockerignore processed_data/
121
 
122
  6. Commit the changes:
123
  git commit -m "Prepare for Hugging Face deployment with pre-processed data"