Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
| import os | |
| from huggingface_hub import hf_hub_download | |
| import logging | |
# Configure logging: set the root logger to INFO level at import time and
# create the module-level logger that download_pdfs() reports progress through.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Hugging Face dataset repository that hosts the PDFs.
_REPO_ID = "kamkol/ab_testing_pdfs"

# Local directory the PDFs are written into (relative to the working directory).
_DATA_DIR = "data"

# Known filenames in the repository. Used when the dataset cannot be loaded or
# its train split is empty. Kept in one place so both fallbacks stay in sync.
_PDF_FILES = (
    "Shipping Flat Treatments in Online Controlled Experiments.pdf",
    "Companies with really small traffic.pdf",
    "Major Redesigns Usually Fail.pdf",
    "Capping Metrics Linkedin Post.pdf",
    "When to Use Bayesian vs Frequentist.pdf",
    "Trustworthy AB Patterns.pdf",
    "Why are Power Calculators Giving Different Results.pdf",
    "Practical Defaults for AB Testing.pdf",
    "P values and Bayes Factors in ABTesting (Frequentist or Bayesian AB).pdf",
    "Statistical Challenges in Online Controlled Experiments A Review of A B Testing Methodology.pdf",
    "Top Challenges from the First Practical Online Controlled Experiments Summit.pdf",
    "CUPED Improving Sensitivity Of Controlled Experiments by Utilizing Pre Experiment Data.pdf",
    "TriggeringRuleOfThumb.pdf",
    "AB Testing Intuition Busters.pdf",
    "The Surprising Power of Online Experiments.pdf",
    "What Should the Primary Metric Be for Experimentation Platforms.pdf",
    "Online Controlled Experiments at Large Scale.pdf",
    "Online Controlled Experiments and AB Tests.pdf",
    "Seven Rules of Thumb for Web Site Experimenters.pdf",
    "Seven Pitfalls to Avoid when Running Controlled Experiments on the Web.pdf",
    "Pvalue Misinterpretations Annotated References.pdf",
    "Pitfalls of Long Term Online Controlled Experiments (Holdout group).pdf",
    "Multi-Armed Bandits, Thompson Sampling, or A_B Testing_ Are you optimizing for short-term headlines or long-term pills worth billions_ _ LinkedIn.pdf",
    "False Positives In AB Tests.pdf",
    "emetrics Amazon.pdf",
    "Trustworthy Online Controlled Experiments Five Puzzling Outcomes Explained.pdf",
    "Trustworthy Online Controlled Experiments - Kohavi, Ron & Tang, Diane & Xu, Ya.pdf",
    "Controlled Experiments on the Web Survey and Practical Guide.pdf",
)


def _resolve_destination(filename):
    """Return the absolute path inside the data directory for *filename*.

    Parent directories are created so repository paths that contain
    subfolders can be written.

    Raises:
        ValueError: If *filename* is absolute or climbs out of the data
            directory (e.g. via ``..``). Filenames come from dataset rows,
            so they are not trusted to stay inside ``data/``.
    """
    data_root = os.path.abspath(_DATA_DIR)
    # os.path.join discards data_root when filename is absolute; the
    # containment check below catches that case as well as ".." segments.
    destination = os.path.abspath(os.path.join(data_root, filename))
    if (
        destination == data_root
        or os.path.commonpath([data_root, destination]) != data_root
    ):
        raise ValueError(
            f"Refusing to write outside {_DATA_DIR!r}: {filename!r}"
        )
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    return destination


def _filename_for(item, index):
    """Return the row's ``"filename"`` value, or a positional default name."""
    filename = item["filename"] if "filename" in item else None
    if filename:
        return filename
    # Covers both a missing key and a present-but-empty/None value.
    logger.warning("No filename found for item %s, using default", index)
    return f"document_{index}.pdf"


def _download_from_repo(filename):
    """Fetch one file from the dataset repository and copy it into data/.

    Failures are logged and swallowed on purpose: one missing or unreadable
    file must not stop the remaining downloads.
    """
    import shutil  # local import: only this helper needs it

    try:
        # Validate the destination first so a bad name costs no network call.
        destination = _resolve_destination(filename)
        # NOTE(review): `use_auth_token` is deprecated in recent
        # huggingface_hub releases in favour of `token`; kept unchanged
        # because the installed version is not visible from this file.
        cached_path = hf_hub_download(
            repo_id=_REPO_ID,
            filename=filename,
            repo_type="dataset",
            use_auth_token=True,
        )
        # Stream the copy instead of reading the whole PDF into memory.
        shutil.copyfile(cached_path, destination)
    except Exception as e:
        logger.error("Error downloading %s: %s", filename, e)
    else:
        logger.info("Downloaded: %s", filename)


def _save_train_split(train):
    """Write every row of the train split into the data directory.

    Rows are written directly when the split has a ``binary`` feature;
    otherwise each row's filename is downloaded from the repository.
    Errors while writing binary rows propagate to the caller, which falls
    back to the known file list.
    """
    if 'binary' in train.features:
        for i, item in enumerate(train):
            filename = _filename_for(item, i)
            with open(_resolve_destination(filename), "wb") as f:
                f.write(item["binary"])
            logger.info("Downloaded: %s", filename)
        return

    logger.info("Dataset doesn't have binary field, trying direct file download")
    for i, item in enumerate(train):
        _download_from_repo(_filename_for(item, i))


def download_pdfs():
    """Download PDF files from the Hugging Face dataset into ``data/``.

    Strategy, in order:

    1. Load the dataset and, if its ``train`` split has rows, write each
       row (from its ``binary`` field, or by downloading its ``filename``).
    2. If the split is missing or empty, or anything in step 1 raises,
       download the known list of PDF filenames directly from the
       repository.

    Individual download failures are logged and do not abort the run.
    Returns nothing; results are the files written under ``data/``.
    """
    logger.info("Creating data directory if it doesn't exist")
    os.makedirs(_DATA_DIR, exist_ok=True)

    try:
        logger.info("Loading the dataset from kamkol/ab_testing_pdfs")
        # NOTE(review): `use_auth_token` is deprecated in recent `datasets`
        # releases in favour of `token`; kept unchanged because the
        # installed version is not visible from this file.
        dataset = load_dataset(_REPO_ID, use_auth_token=True)

        if 'train' in dataset and len(dataset['train']) > 0:
            train = dataset['train']
            logger.info("Found %s files in dataset", len(train))
            _save_train_split(train)
            return

        logger.info("No files found in dataset train split, trying direct repository access")
    except Exception as e:
        # Deliberately broad: any failure on the dataset path falls back to
        # the known file list rather than leaving data/ empty.
        logger.error("Error loading dataset: %s", e)
        logger.info("Falling back to direct file download")

    for filename in _PDF_FILES:
        _download_from_repo(filename)
# Run the download only when this file is executed as a script, not on import.
if __name__ == "__main__":
    download_pdfs()