Spaces:
Sleeping
Sleeping
| """ | |
| Notebook downloader for collecting sample notebooks | |
| """ | |
| import requests | |
| from pathlib import Path | |
| from typing import List | |
| import time | |
| import json | |
class NotebookDownloader:
    """Download sample notebooks from GitHub.

    Each notebook is fetched with ``requests``, checked to be a JSON object
    with a top-level ``cells`` key, and re-serialized into ``output_dir``.
    Failures are reported with ``print`` and skipped, so a single bad URL
    does not abort the remaining downloads.
    """

    # Timeout, in seconds, passed to ``requests.get``.
    REQUEST_TIMEOUT = 10
    # Seconds to sleep after each successful download (rate limiting).
    RATE_LIMIT_DELAY = 1
    # Maximum number of notebooks attempted per repository.
    MAX_PER_REPO = 2

    # Repositories visited by ``_download_from_github``, in this order.
    REPOS = (
        "pandas-dev/pandas",
        "matplotlib/matplotlib",
        "scikit-learn/scikit-learn",
        "statsmodels/statsmodels",
    )

    # Hard-coded (raw URL, local filename) pairs per repository.
    # "statsmodels/statsmodels" has no entry here, so it yields no notebooks.
    # NOTE(review): these URLs are not checked anywhere in this class --
    # confirm they still resolve.
    KNOWN_NOTEBOOKS = {
        "pandas-dev/pandas": (
            ("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/source/user_guide/10min.ipynb", "pandas_10min.ipynb"),
        ),
        "matplotlib/matplotlib": (
            ("https://raw.githubusercontent.com/matplotlib/matplotlib/main/tutorials/introductory/sample_plots.ipynb", "matplotlib_sample.ipynb"),
        ),
        "scikit-learn/scikit-learn": (
            ("https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/examples/linear_model/plot_ols.ipynb", "sklearn_ols.ipynb"),
        ),
    }

    def __init__(self, output_dir: str):
        """Remember the output directory and create it if needed.

        Args:
            output_dir: Directory that downloaded notebooks are written to.
                Missing parent directories are created as well.
        """
        self.output_dir = Path(output_dir)
        # parents=True so a nested path such as "data/notebooks" works even
        # when "data" does not exist yet.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_all(self) -> List[str]:
        """Download notebooks from every configured source.

        Returns:
            The filenames of all notebooks that were downloaded successfully.
        """
        downloaded = []

        # Download from predefined sources
        sources = [
            self._download_from_github,
        ]

        for source_func in sources:
            # Best-effort boundary: a failing source is reported and skipped.
            try:
                downloaded.extend(source_func())
            except Exception as e:
                print(f"Error downloading from source: {e}")

        return downloaded

    def _download_from_github(self) -> List[str]:
        """Download up to ``MAX_PER_REPO`` notebooks from each repo in ``REPOS``.

        Returns:
            The filenames of the notebooks that were downloaded successfully.
        """
        downloaded = []

        for repo in self.REPOS:
            try:
                print(f"Fetching from {repo}...")
                notebooks = self._search_github_notebooks(repo)

                for nb_url, nb_name in notebooks[:self.MAX_PER_REPO]:
                    try:
                        self._download_notebook(nb_url, nb_name)
                    except Exception as e:
                        print(f"Failed to download {nb_name}: {e}")
                    else:
                        downloaded.append(nb_name)
                        time.sleep(self.RATE_LIMIT_DELAY)  # Rate limiting
            except Exception as e:
                print(f"Failed to fetch from {repo}: {e}")

        return downloaded

    def _search_github_notebooks(self, repo: str) -> List[tuple]:
        """Return the known ``(url, filename)`` pairs for ``repo``.

        This is a simplified lookup in ``KNOWN_NOTEBOOKS``; it does not query
        the GitHub API.

        Args:
            repo: Repository in ``owner/name`` form.

        Returns:
            A new list of ``(url, filename)`` tuples, empty for unknown repos.
        """
        # Copy into a fresh list so callers cannot mutate the shared table.
        return list(self.KNOWN_NOTEBOOKS.get(repo, ()))

    @staticmethod
    def _validate_notebook(data) -> None:
        """Check that ``data`` looks like a notebook document.

        Args:
            data: The decoded JSON payload.

        Raises:
            ValueError: If ``data`` is not a dict or has no ``cells`` key.
        """
        if not isinstance(data, dict) or 'cells' not in data:
            raise ValueError("Not a valid notebook")

    def _download_notebook(self, url: str, filename: str):
        """Download a single notebook and write it into ``output_dir``.

        Args:
            url: URL of the raw notebook file.
            filename: Name of the file to create inside ``output_dir``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the body is not JSON or is not a notebook.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        # Only the decode step is guarded: requests reports malformed JSON
        # with a ValueError subclass, which is re-labelled here with the
        # original cause chained.
        try:
            data = response.json()
        except ValueError as err:
            raise ValueError("Invalid notebook format") from err

        # Validate it's a notebook
        self._validate_notebook(data)

        output_path = self.output_dir / filename
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=1)

        print(f"✓ Downloaded: {filename}")