# AB_Testing_RAG_Agent / download_pdfs.py
# kamkol's picture
# Better handling large preprocessed data file to Huggingface
# 2585f8a
import logging
import os
import shutil

from datasets import load_dataset
from huggingface_hub import hf_hub_download
# Module-level logger named after this module; used by everything below.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level so progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Hugging Face dataset repository that hosts the PDFs.
_REPO_ID = "kamkol/ab_testing_pdfs"

# Local directory (relative to the working directory) the PDFs are written to.
_DATA_DIR = "data"

# Known PDF filenames in the repository. Used when the dataset cannot be
# loaded, or when it loads but has no usable train split.
_FALLBACK_PDF_FILES = (
    "Shipping Flat Treatments in Online Controlled Experiments.pdf",
    "Companies with really small traffic.pdf",
    "Major Redesigns Usually Fail.pdf",
    "Capping Metrics Linkedin Post.pdf",
    "When to Use Bayesian vs Frequentist.pdf",
    "Trustworthy AB Patterns.pdf",
    "Why are Power Calculators Giving Different Results.pdf",
    "Practical Defaults for AB Testing.pdf",
    "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
    "Statistical Challenges in Online Controlled Experiments A Review of A B Testing Methodology.pdf",
    "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
    "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
    "TriggeringRuleOfThumb.pdf",
    "AB Testing Intuition Busters.pdf",
    "The Surprising Power of Online Experiments.pdf",
    "What Should the Primary Metric Be for Experimentation Platforms.pdf",
    "Online Controlled Experiments at Large Scale.pdf",
    "Online Controlled Experiments and AB Tests.pdf",
    "Seven Rules of Thumb for Web Site Experimenters.pdf",
    "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
    "Pvalue Misinterpretations Annotated References.pdf",
    "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
    "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
    "False Positives In AB Tests.pdf",
    "emetrics Amazon.pdf",
    "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
    "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
    "Controlled Experiments on the Web Survey and Practical Guide.pdf",
)


def _local_target(filename):
    """Return the absolute path inside the data directory for *filename*.

    Parent directories are created so that names containing sub-folders can
    be written. Raises ValueError when the name would resolve to the data
    directory itself or to a location outside it (absolute paths, ``..``).
    """
    data_root = os.path.abspath(_DATA_DIR)
    target = os.path.abspath(os.path.join(data_root, filename))
    # Filenames may come from remote dataset rows, so refuse anything that
    # escapes the data directory instead of writing to an arbitrary path.
    if target == data_root or os.path.commonpath([data_root, target]) != data_root:
        raise ValueError(f"Refusing to write outside {_DATA_DIR}/: {filename!r}")
    os.makedirs(os.path.dirname(target), exist_ok=True)
    return target


def _dataset_filenames(rows):
    """Yield a repository filename for each dataset row.

    Rows without a usable ``filename`` value get ``document_<index>.pdf`` and
    a warning is logged for them.
    """
    for i, item in enumerate(rows):
        filename = item["filename"] if "filename" in item else None
        if not filename:
            logger.warning(f"No filename found for item {i}, using default")
            filename = f"document_{i}.pdf"
        yield filename


def _download_from_hub(filenames):
    """Download each named file from the dataset repo into the data directory.

    A failure for one file is logged and does not stop the remaining files.
    """
    for filename in filenames:
        try:
            # Validate the destination first so a bad name costs no download.
            target = _local_target(filename)
            # NOTE(review): `use_auth_token` is kept as in the original call;
            # newer huggingface_hub releases appear to prefer `token=` —
            # confirm against the pinned version before changing it.
            cached_path = hf_hub_download(
                repo_id=_REPO_ID,
                filename=filename,
                repo_type="dataset",
                use_auth_token=True,
            )
            # Stream the copy rather than reading the whole PDF into memory.
            shutil.copyfile(cached_path, target)
            logger.info(f"Downloaded: {filename}")
        except Exception as e:
            logger.error(f"Error downloading {filename}: {str(e)}")


def download_pdfs():
    """
    Download PDF files from the Hugging Face dataset.

    Strategy, in order:

    1. Load the dataset. If its ``train`` split has a ``binary`` feature,
       write each row's bytes to ``data/<filename>``.
    2. If the split has rows but no ``binary`` feature, download each row's
       ``filename`` directly from the dataset repository.
    3. If the split is missing or empty, or if anything in steps 1-2 raises,
       download the hard-coded list of known PDF filenames instead.

    Returns None. Errors from loading or downloading are logged rather than
    raised; only a failure to create the ``data`` directory propagates.
    """
    logger.info("Creating data directory if it doesn't exist")
    os.makedirs(_DATA_DIR, exist_ok=True)

    try:
        logger.info("Loading the dataset from kamkol/ab_testing_pdfs")
        # NOTE(review): `use_auth_token` kept as in the original; newer
        # `datasets` releases appear to prefer `token=` — confirm before changing.
        dataset = load_dataset(_REPO_ID, use_auth_token=True)

        if 'train' not in dataset or len(dataset['train']) == 0:
            logger.info("No files found in dataset train split, trying direct repository access")
            _download_from_hub(_FALLBACK_PDF_FILES)
            return

        train = dataset['train']
        logger.info(f"Found {len(train)} files in dataset")

        if 'binary' in train.features:
            # PDF bytes are embedded in the rows. Any error here (bad name,
            # missing payload) propagates to the outer handler, which falls
            # back to the hard-coded list.
            for i, item in enumerate(train):
                filename = item["filename"] if "filename" in item else None
                if not filename:
                    filename = f"document_{i}.pdf"
                with open(_local_target(filename), "wb") as f:
                    f.write(item["binary"])
                logger.info(f"Downloaded: {filename}")
            return

        logger.info("Dataset doesn't have binary field, trying direct file download")
        # The generator keeps per-row warnings interleaved with the downloads.
        _download_from_hub(_dataset_filenames(train))
    except Exception as e:
        logger.error(f"Error loading dataset: {str(e)}")
        logger.info("Falling back to direct file download")
        _download_from_hub(_FALLBACK_PDF_FILES)
# Script entry point: fetch the PDFs only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_pdfs()