Krishna172912 committed on
Commit
3246962
·
unverified ·
1 Parent(s): 4073eb5

Create downloader.py

Browse files
Files changed (1) hide show
  1. back_end/core/downloader.py +174 -0
back_end/core/downloader.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import stat
3
+ import zipfile
4
+ import logging
5
+ import io
6
+ from tqdm import tqdm
7
+ import os
8
+ import shutil # to remove folder
9
+ from pathlib import Path
10
+
11
# Module-wide logging setup: records at INFO and above are formatted as
# "LEVEL: message". ``basicConfig`` acts on the root logger, so this runs
# as an import-time side effect of the module.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)

# Logger shared by every function in this module.
logger = logging.getLogger(__name__)
13
+
14
+
15
+ # ─────────────────────────── helpers ────────────────────────────
16
+
17
def _force_remove(action, name, exc):
    """Error callback for ``shutil.rmtree``: clear read-only and retry removal.

    ``shutil.rmtree`` calls this with the function that failed, the path it
    failed on, and the error information. The typical trigger is a read-only
    entry on Windows. The write bit is set on the path and removal is retried
    with the call that fits the path type.

    Parameters
    ----------
    action : The function that raised (unused; kept for the callback signature).
    name   : Path of the entry that could not be removed.
    exc    : Error information supplied by ``rmtree`` (unused).

    The callback is best-effort: a failure here is logged as a warning and
    never propagated, so ``rmtree`` keeps going with the remaining entries.
    """
    try:
        os.chmod(name, stat.S_IWRITE)
        # os.remove cannot delete a directory, so a read-only directory that
        # made os.rmdir fail has to be retried with os.rmdir.
        if os.path.isdir(name) and not os.path.islink(name):
            os.rmdir(name)
        else:
            os.remove(name)
    except OSError as e:
        logger.warning(f"Could not remove '{name}': {e}")
24
+
25
+
26
def delete_dir(path_) -> None:
    """Recursively delete a directory, handling read-only files.

    Parameters
    ----------
    path_ : Directory to remove (anything ``Path`` accepts). A path that
            does not exist is silently ignored.

    Removal is best-effort: entries that cannot be deleted are handled by
    ``_force_remove``, which logs instead of raising. The outcome is checked
    afterwards so the log says whether the tree is really gone.
    """
    import sys  # local import: only needed for the version check below

    path = Path(path_)
    if not path.exists():
        return

    # ``onerror`` is deprecated since Python 3.12 in favour of ``onexc``.
    # Both pass (function, path, error-info), which _force_remove accepts.
    if sys.version_info >= (3, 12):
        shutil.rmtree(path, onexc=_force_remove)
    else:
        shutil.rmtree(path, onerror=_force_remove)

    if path.exists():
        logger.warning(f"Could not fully delete directory: {path}")
    else:
        logger.info(f"Deleted existing directory: {path}")
32
+
33
+
34
def repo_name_from_url(url: str) -> str:
    """Extract a clean repo name from a GitHub URL.

    Surrounding whitespace, ``?query`` strings, ``#fragment`` parts, trailing
    slashes and a ``.git`` suffix are ignored. For ``github.com`` URLs the
    repository is the second path segment, so deep links such as
    ``https://github.com/user/repo/tree/main`` still yield ``"repo"``. For
    anything else (for example ``git@github.com:user/repo.git``) the last
    path segment is used.

    Parameters
    ----------
    url : Repository URL or path-like string.

    Returns
    -------
    The repository name, or ``""`` when the URL has no path segment at all
    (for example ``https://github.com/``).
    """
    from urllib.parse import urlsplit  # local import: stdlib, used only here

    parts = urlsplit(url.strip())
    segments = [segment for segment in parts.path.split("/") if segment]
    if not segments:
        return ""

    host = (parts.hostname or "").lower()
    if host in ("github.com", "www.github.com") and len(segments) >= 2:
        # Path layout is /<owner>/<repo>[/anything/else].
        name = segments[1]
    else:
        name = segments[-1]
    return name.removesuffix(".git")
39
+
40
+
41
+ # ─────────────────────────── core ───────────────────────────────
42
+
43
def _canonical_github_url(repo_url: str) -> str:
    """Validate *repo_url* and reduce it to ``https://github.com/<owner>/<repo>``."""
    from urllib.parse import urlsplit  # local import: stdlib, used only here

    cleaned = repo_url.strip()
    if not cleaned.startswith("https://github.com/"):
        raise ValueError(f"Expected a GitHub URL, got: {cleaned!r}")

    # urlsplit keeps ?query and #fragment out of the path.
    segments = [part for part in urlsplit(cleaned).path.split("/") if part]
    if len(segments) < 2:
        raise ValueError(f"Could not extract a repo name from URL: {cleaned!r}")

    owner = segments[0]
    repo = segments[1].removesuffix(".git")

    # The repo name becomes a directory that may later be deleted, so "." and
    # ".." must never get through. The character whitelist assumes GitHub
    # names are ASCII letters, digits, "-", "_" and "." (TODO confirm).
    allowed_punctuation = "-_."
    for part in (owner, repo):
        well_formed = all(
            ch.isascii() and (ch.isalnum() or ch in allowed_punctuation)
            for ch in part
        )
        if part in ("", ".", "..") or not well_formed:
            raise ValueError(f"Could not extract a repo name from URL: {cleaned!r}")

    return f"https://github.com/{owner}/{repo}"


def _fetch_archive(zip_url: str, label: str, timeout: int, chunk_size: int) -> io.BytesIO:
    """Stream *zip_url* into an in-memory buffer while showing a progress bar."""
    try:
        response = requests.get(
            zip_url,
            stream=True,
            timeout=timeout,
            allow_redirects=True,
        )
    except requests.exceptions.Timeout as e:
        raise requests.exceptions.Timeout(
            f"Connection timed out after {timeout}s — check your network."
        ) from e
    except requests.exceptions.ConnectionError as e:
        raise requests.exceptions.ConnectionError(
            f"Could not reach GitHub. Is the URL correct? Details: {e}"
        ) from e

    buffer = io.BytesIO()
    try:
        response.raise_for_status()
        total_size = int(response.headers.get("content-length", 0))
        with tqdm(
            total=total_size or None,
            unit="iB",
            unit_scale=True,
            desc=f"Downloading '{label}'",
        ) as bar:
            for chunk in response.iter_content(chunk_size):
                if chunk:  # filter keep-alive empty chunks
                    buffer.write(chunk)
                    bar.update(len(chunk))
    finally:
        # A streamed response holds its connection until it is closed.
        response.close()
    return buffer


def _extract_members(archive, members, repo_dest: Path) -> None:
    """Write *members* of the open *archive* into *repo_dest*, minus the top-level folder."""
    root = repo_dest.resolve()

    # GitHub wraps everything in a top-level folder like "user-repo-abc123/".
    # Detect it so we can strip it and land files directly in repo_dest.
    top_level = members[0].split("/")[0] + "/"

    with tqdm(total=len(members), unit="file", desc="Extracting") as bar:
        for member in members:
            relative = member.removeprefix(top_level)
            if relative:  # an empty name is the top-level dir entry itself
                target = (root / relative).resolve()
                # Refuse absolute names and ".." components that would write
                # outside the destination directory.
                if target != root and root not in target.parents:
                    raise zipfile.BadZipFile(
                        f"Unsafe path in ZIP archive: {member!r}"
                    )

                if member.endswith("/"):  # directory entry
                    target.mkdir(parents=True, exist_ok=True)
                else:
                    target.parent.mkdir(parents=True, exist_ok=True)
                    with archive.open(member) as src, open(target, "wb") as dst:
                        shutil.copyfileobj(src, dst)
            bar.update(1)


def download_github_repo(
    repo_url: str,
    storage_dir: Path,
    *,
    timeout: int = 30,
    chunk_size: int = 1024,
    overwrite: bool = True,
) -> Path:
    """
    Download a GitHub repository as a ZIP and extract it into *storage_dir*.

    The URL is reduced to ``https://github.com/<owner>/<repo>`` first, so deep
    links, ``.git`` suffixes, query strings and fragments are accepted. The
    archive is downloaded and validated *before* an existing copy is deleted,
    so a failed download does not destroy the previous copy. If extraction
    fails, the partially written directory is removed again.

    Parameters
    ----------
    repo_url    : Full GitHub repo URL, e.g. "https://github.com/user/repo"
    storage_dir : Parent folder that holds all downloaded repos
                  (``Path`` or anything ``Path`` accepts).
    timeout     : Seconds before the HTTP connection times out.
    chunk_size  : Download chunk size in bytes.
    overwrite   : If True, replace an existing copy; if False and a copy
                  exists, return it without downloading.

    Returns
    -------
    Path to the extracted repository directory inside storage_dir.

    Raises
    ------
    ValueError              : Bad URL, missing/unsafe repo name, or empty download.
    requests.HTTPError      : Non-200 response from GitHub.
    requests.Timeout        : The connection timed out.
    requests.ConnectionError: GitHub could not be reached.
    zipfile.BadZipFile      : Corrupted/incomplete download, empty archive,
                              or an entry that would escape the destination.
    OSError                 : Filesystem failures.
    """

    # ── validate URL ──────────────────────────────────────────────
    repo_url = _canonical_github_url(repo_url)

    name = repo_name_from_url(repo_url)
    if not name:
        raise ValueError(f"Could not extract a repo name from URL: {repo_url!r}")

    # ── prepare destination ───────────────────────────────────────
    storage_dir = Path(storage_dir)  # tolerate plain string paths
    storage_dir.mkdir(parents=True, exist_ok=True)
    repo_dest = storage_dir / name

    if repo_dest.exists() and not overwrite:
        logger.info(f"Repo already exists and overwrite=False: {repo_dest}")
        return repo_dest

    # ── download ──────────────────────────────────────────────────
    zip_url = f"{repo_url}/zipball/HEAD"
    logger.info(f"Connecting to GitHub: {zip_url}")

    file_stream = _fetch_archive(zip_url, name, timeout, chunk_size)

    downloaded_bytes = file_stream.tell()
    if downloaded_bytes == 0:
        raise ValueError("Download produced an empty file — nothing to extract.")

    logger.info(f"Download complete ({downloaded_bytes / 1024:.1f} KB). Extracting…")

    # ── validate zip ──────────────────────────────────────────────
    file_stream.seek(0)
    if not zipfile.is_zipfile(file_stream):
        raise zipfile.BadZipFile("Downloaded content is not a valid ZIP archive.")

    # ── extract ───────────────────────────────────────────────────
    file_stream.seek(0)
    with zipfile.ZipFile(file_stream) as z:
        members = z.namelist()
        if not members:
            raise zipfile.BadZipFile("ZIP archive is empty.")

        # Only now, with a usable archive in hand, drop the previous copy
        # (overwrite is necessarily True if one still exists here).
        if repo_dest.exists():
            delete_dir(repo_dest)
        repo_dest.mkdir(parents=True, exist_ok=True)

        try:
            _extract_members(z, members, repo_dest)
        except BaseException:
            # Do not leave a half-extracted tree that a later call with
            # overwrite=False would mistake for a complete repository.
            delete_dir(repo_dest)
            raise

    logger.info(f"[SUCCESS] Repository extracted to: {repo_dest}")
    return repo_dest