mozzic committed on
Commit
d788958
·
verified ·
1 Parent(s): 6a8cd1b

Upload src\notebook_downloader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src//notebook_downloader.py +100 -0
src//notebook_downloader.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Notebook downloader for collecting sample notebooks
3
+ """
4
+
5
+ import requests
6
+ from pathlib import Path
7
+ from typing import List
8
+ import time
9
+ import json
10
+
11
+
12
class NotebookDownloader:
    """Download a small, fixed set of sample notebooks from GitHub.

    Each notebook is fetched over HTTP with ``requests``, checked to be a
    JSON object that contains a ``cells`` key, and then written as JSON
    into ``output_dir``.
    """

    def __init__(self, output_dir: str):
        """Store the destination directory, creating it when missing.

        Args:
            output_dir: Directory the downloaded notebooks are written to.
        """
        self.output_dir = Path(output_dir)
        # parents=True lets a nested path (e.g. "data/notebooks") be created
        # in one go instead of failing with FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def download_all(self) -> List[str]:
        """Download notebooks from every configured source.

        A failing source is reported on stdout and skipped, so one broken
        source does not prevent the others from running.

        Returns:
            File names of the notebooks that were downloaded successfully.
        """
        downloaded = []

        # Download from predefined sources
        sources = [
            self._download_from_github,
        ]

        for source_func in sources:
            try:
                downloaded.extend(source_func())
            except Exception as e:
                # Best-effort boundary: report and carry on with the next source.
                print(f"Error downloading from source: {e}")

        return downloaded

    def _download_from_github(self) -> List[str]:
        """Download up to two known notebooks from each listed GitHub repo.

        Failures are printed and skipped, both per notebook and per repo.

        Returns:
            File names of the notebooks that were downloaded successfully.
        """
        repos = [
            "pandas-dev/pandas",
            "matplotlib/matplotlib",
            "scikit-learn/scikit-learn",
            "statsmodels/statsmodels",
        ]

        downloaded = []

        for repo in repos:
            try:
                print(f"Fetching from {repo}...")
                notebooks = self._search_github_notebooks(repo)
                for nb_url, nb_name in notebooks[:2]:  # Limit per repo
                    try:
                        self._download_notebook(nb_url, nb_name)
                        downloaded.append(nb_name)
                        time.sleep(1)  # Rate limiting
                    except Exception as e:
                        print(f"Failed to download {nb_name}: {e}")
            except Exception as e:
                print(f"Failed to fetch from {repo}: {e}")

        return downloaded

    def _search_github_notebooks(self, repo: str) -> List[tuple]:
        """Return the known ``(url, filename)`` pairs for a GitHub repo.

        This is a simplified lookup in a hard-coded table; no GitHub API
        call is made.

        Args:
            repo: Repository in ``owner/name`` form.

        Returns:
            A list of ``(raw_url, local_filename)`` tuples, or an empty list
            when the repo has no entry in the table.
        """
        known_notebooks = {
            "pandas-dev/pandas": [
                ("https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/source/user_guide/10min.ipynb", "pandas_10min.ipynb")
            ],
            "matplotlib/matplotlib": [
                ("https://raw.githubusercontent.com/matplotlib/matplotlib/main/tutorials/introductory/sample_plots.ipynb", "matplotlib_sample.ipynb")
            ],
            "scikit-learn/scikit-learn": [
                ("https://raw.githubusercontent.com/scikit-learn/scikit-learn/main/examples/linear_model/plot_ols.ipynb", "sklearn_ols.ipynb")
            ],
        }

        return known_notebooks.get(repo, [])

    @staticmethod
    def _validate_notebook(data) -> None:
        """Check that decoded JSON is a dict with a ``cells`` key.

        Args:
            data: The value obtained by decoding the downloaded JSON.

        Raises:
            ValueError: If ``data`` is not a dict or has no ``cells`` key.
        """
        # The isinstance check keeps JSON scalars from raising TypeError on
        # the membership test and rejects lists that merely contain "cells".
        if not isinstance(data, dict) or 'cells' not in data:
            raise ValueError("Not a valid notebook")

    def _download_notebook(self, url: str, filename: str):
        """Download one notebook, validate it, and write it to ``output_dir``.

        Args:
            url: URL the notebook JSON is fetched from.
            filename: File name to write inside ``self.output_dir``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the body is not JSON, or is not a dict with a
                ``cells`` key.
        """
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Validate it's a notebook. Only decoding errors are translated here;
        # a bare except would also trap KeyboardInterrupt and hide the more
        # specific validation message.
        try:
            data = response.json()
        except ValueError as err:
            raise ValueError("Invalid notebook format") from err
        self._validate_notebook(data)

        output_path = self.output_dir / filename
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=1)

        print(f"✓ Downloaded: {filename}")