Spaces:

kamkol
/

AB_Testing_RAG_Agent

Sleeping

App Files Files Community

AB_Testing_RAG_Agent / download_pdfs.py

kamkol

Better handling large preprocessed data file to Huggingface

2585f8a 12 months ago

raw

history blame contribute delete

8.79 kB

	import logging
	import os
	import shutil

	from datasets import load_dataset
	from huggingface_hub import hf_hub_download

	# Configure logging: set the root logger to emit INFO-level records and above
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module, used by the code below
	logger = logging.getLogger(__name__)

	# Dataset repository on the Hugging Face Hub that holds the PDFs.
	_REPO_ID = "kamkol/ab_testing_pdfs"

	# Local directory the PDFs are written to.
	_DATA_DIR = "data"

	# Known PDF filenames in the repository. Used when the dataset's train split
	# is missing/empty or the dataset cannot be loaded at all; the names must
	# match the repository's filenames exactly.
	_PDF_FILES = (
	    "Shipping Flat Treatments in Online Controlled Experiments.pdf",
	    "Companies with really small traffic.pdf",
	    "Major Redesigns Usually Fail.pdf",
	    "Capping Metrics Linkedin Post.pdf",
	    "When to Use Bayesian vs Frequentist.pdf",
	    "Trustworthy AB Patterns.pdf",
	    "Why are Power Calculators Giving Different Results.pdf",
	    "Practical Defaults for AB Testing.pdf",
	    "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
	    "Statistical Challenges in Online Controlled Experiments A Review of A B Testing Methodology.pdf",
	    "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
	    "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
	    "TriggeringRuleOfThumb.pdf",
	    "AB Testing Intuition Busters.pdf",
	    "The Surprising Power of Online Experiments.pdf",
	    "What Should the Primary Metric Be for Experimentation Platforms.pdf",
	    "Online Controlled Experiments at Large Scale.pdf",
	    "Online Controlled Experiments and AB Tests.pdf",
	    "Seven Rules of Thumb for Web Site Experimenters.pdf",
	    "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
	    "Pvalue Misinterpretations Annotated References.pdf",
	    "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
	    "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
	    "False Positives In AB Tests.pdf",
	    "emetrics Amazon.pdf",
	    "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
	    "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
	    "Controlled Experiments on the Web Survey and Practical Guide.pdf",
	)


	def _download_from_repo(filename):
	    """Fetch one file from the dataset repository and copy it into the data directory.

	    Any failure is logged and swallowed so that one bad file does not stop
	    the remaining downloads.

	    Args:
	        filename: Path of the file inside the dataset repository; also used
	            as the name of the local copy under the data directory.

	    Returns:
	        True if the file was downloaded and copied, False otherwise.
	    """
	    try:
	        # NOTE(review): newer huggingface_hub releases appear to deprecate
	        # `use_auth_token` in favour of `token` - confirm against the
	        # pinned version before changing.
	        cached_path = hf_hub_download(
	            repo_id=_REPO_ID,
	            filename=filename,
	            repo_type="dataset",
	            use_auth_token=True,
	        )
	        # Stream the copy rather than reading the whole PDF into memory.
	        shutil.copyfile(cached_path, f"{_DATA_DIR}/{filename}")
	        logger.info("Downloaded: %s", filename)
	    except Exception as e:
	        # Best-effort per file: report the problem and let the caller continue.
	        logger.error("Error downloading %s: %s", filename, e)
	        return False
	    return True


	def _download_known_pdfs():
	    """Download every file listed in ``_PDF_FILES`` directly from the repository."""
	    for filename in _PDF_FILES:
	        _download_from_repo(filename)


	def _item_filename(item, index):
	    """Return the filename stored in a dataset row, or a positional default.

	    A missing, ``None`` or empty ``filename`` value falls back to
	    ``document_<index>.pdf`` so the output path is always usable.
	    """
	    filename = item.get("filename")
	    if not filename:
	        logger.warning(f"No filename found for item {index}, using default")
	        filename = f"document_{index}.pdf"
	    return filename


	def download_pdfs():
	    """Download PDF files from the Hugging Face dataset into the ``data`` directory.

	    Strategy, in order:

	    1. Load the dataset. If its ``train`` split has a ``binary`` feature,
	       write each row's bytes to ``data/<filename>``.
	    2. If the split has rows but no ``binary`` feature, download each row's
	       ``filename`` from the repository.
	    3. If the split is missing or empty, download the known list of PDFs
	       from the repository.
	    4. If anything in steps 1-3 raises (including a failure part-way through
	       writing binary rows), log the error and download the known list of
	       PDFs from the repository.

	    Individual repository download failures are logged and skipped; this
	    function does not raise for them.
	    """
	    logger.info("Creating data directory if it doesn't exist")
	    os.makedirs(_DATA_DIR, exist_ok=True)

	    try:
	        logger.info("Loading the dataset from kamkol/ab_testing_pdfs")
	        # NOTE(review): newer `datasets` releases appear to deprecate
	        # `use_auth_token` in favour of `token` - confirm against the
	        # pinned version before changing.
	        dataset = load_dataset(_REPO_ID, use_auth_token=True)

	        # No usable train split: go straight to the known file list.
	        if 'train' not in dataset or len(dataset['train']) == 0:
	            logger.info("No files found in dataset train split, trying direct repository access")
	            _download_known_pdfs()
	            return

	        train_split = dataset['train']
	        logger.info("Found %s files in dataset", len(train_split))

	        if 'binary' in train_split.features:
	            # The rows carry the PDF bytes themselves.
	            for i, item in enumerate(train_split):
	                filename = _item_filename(item, i)
	                with open(f"{_DATA_DIR}/{filename}", "wb") as f:
	                    f.write(item["binary"])
	                logger.info("Downloaded: %s", filename)
	        else:
	            # The rows only name the files; fetch each one from the repository.
	            logger.info("Dataset doesn't have binary field, trying direct file download")
	            for i, item in enumerate(train_split):
	                _download_from_repo(_item_filename(item, i))

	    except Exception as e:
	        # Top-level boundary: any dataset problem degrades to the known file list.
	        logger.error("Error loading dataset: %s", e)
	        logger.info("Falling back to direct file download")
	        _download_known_pdfs()

	# Run the download only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    download_pdfs()