Spaces:
Running
Running
Asish Karthikeya Gogineni committed on
Commit ·
c06be9c
1
Parent(s): 82387e1
feat: Add HTTP ZIP download fallback for GitHub repos
Browse files
- GitHubRepoManager now tries git clone first, falls back to HTTP ZIP download
- Downloads repo as ZIP from GitHub's archive URL (github.com/<owner>/<repo>/archive/refs/heads/<branch>.zip) if GitPython is unavailable or the clone fails
- Fixes GitHub repo ingestion on Hugging Face Spaces where git may not work
- Handles both 'main' and 'master' branch naming conventions
code_chatbot/universal_ingestor.py
CHANGED
|
@@ -301,11 +301,12 @@ class GitHubRepoManager(DataManager):
|
|
| 301 |
self.path = os.path.join(local_dir, repo_id.replace("/", "_"))
|
| 302 |
|
| 303 |
def download(self) -> bool:
|
| 304 |
-
"""Clones the GitHub repository."""
|
| 305 |
if os.path.exists(self.path) and os.listdir(self.path):
|
| 306 |
logger.info(f"Repo already cloned at {self.path}")
|
| 307 |
return True
|
| 308 |
|
|
|
|
| 309 |
try:
|
| 310 |
from git import Repo, GitCommandError
|
| 311 |
|
|
@@ -325,10 +326,60 @@ class GitHubRepoManager(DataManager):
|
|
| 325 |
logger.info(f"Cloned {self.repo_id} to {self.path}")
|
| 326 |
return True
|
| 327 |
except ImportError:
|
| 328 |
-
logger.
|
| 329 |
-
raise
|
| 330 |
except Exception as e:
|
| 331 |
-
logger.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
return False
|
| 333 |
|
| 334 |
def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
|
|
|
|
| 301 |
self.path = os.path.join(local_dir, repo_id.replace("/", "_"))
|
| 302 |
|
| 303 |
def download(self) -> bool:
|
| 304 |
+
"""Clones the GitHub repository. Falls back to HTTP ZIP download if git fails."""
|
| 305 |
if os.path.exists(self.path) and os.listdir(self.path):
|
| 306 |
logger.info(f"Repo already cloned at {self.path}")
|
| 307 |
return True
|
| 308 |
|
| 309 |
+
# Try git clone first
|
| 310 |
try:
|
| 311 |
from git import Repo, GitCommandError
|
| 312 |
|
|
|
|
| 326 |
logger.info(f"Cloned {self.repo_id} to {self.path}")
|
| 327 |
return True
|
| 328 |
except ImportError:
|
| 329 |
+
logger.warning("GitPython not available, falling back to HTTP download")
|
|
|
|
| 330 |
except Exception as e:
|
| 331 |
+
logger.warning(f"Git clone failed: {e}, falling back to HTTP download")
|
| 332 |
+
|
| 333 |
+
# Fallback: Download as ZIP via GitHub API
|
| 334 |
+
try:
|
| 335 |
+
return self._download_as_zip()
|
| 336 |
+
except Exception as e:
|
| 337 |
+
logger.error(f"Failed to download {self.repo_id}: {e}")
|
| 338 |
+
return False
|
| 339 |
+
|
| 340 |
+
def _download_as_zip(self) -> bool:
|
| 341 |
+
"""Download repo as ZIP from GitHub API (fallback method)."""
|
| 342 |
+
import io
|
| 343 |
+
|
| 344 |
+
# Download ZIP from GitHub
|
| 345 |
+
zip_url = f"https://github.com/{self.repo_id}/archive/refs/heads/main.zip"
|
| 346 |
+
logger.info(f"Downloading {self.repo_id} as ZIP from {zip_url}")
|
| 347 |
+
|
| 348 |
+
try:
|
| 349 |
+
response = requests.get(zip_url, timeout=60)
|
| 350 |
+
if response.status_code == 404:
|
| 351 |
+
# Try 'master' branch if 'main' doesn't exist
|
| 352 |
+
zip_url = f"https://github.com/{self.repo_id}/archive/refs/heads/master.zip"
|
| 353 |
+
response = requests.get(zip_url, timeout=60)
|
| 354 |
+
|
| 355 |
+
response.raise_for_status()
|
| 356 |
+
|
| 357 |
+
# Extract ZIP
|
| 358 |
+
os.makedirs(self.path, exist_ok=True)
|
| 359 |
+
|
| 360 |
+
with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
|
| 361 |
+
# Extract to temp location first
|
| 362 |
+
temp_extract = os.path.join(self.local_dir, "_temp_extract")
|
| 363 |
+
zip_ref.extractall(temp_extract)
|
| 364 |
+
|
| 365 |
+
# GitHub ZIPs have a top-level folder like "repo-main", move contents up
|
| 366 |
+
extracted_items = os.listdir(temp_extract)
|
| 367 |
+
if len(extracted_items) == 1 and os.path.isdir(os.path.join(temp_extract, extracted_items[0])):
|
| 368 |
+
# Move contents of the single folder to our target path
|
| 369 |
+
source_dir = os.path.join(temp_extract, extracted_items[0])
|
| 370 |
+
for item in os.listdir(source_dir):
|
| 371 |
+
shutil.move(os.path.join(source_dir, item), os.path.join(self.path, item))
|
| 372 |
+
shutil.rmtree(temp_extract)
|
| 373 |
+
else:
|
| 374 |
+
# Move all items directly
|
| 375 |
+
for item in extracted_items:
|
| 376 |
+
shutil.move(os.path.join(temp_extract, item), os.path.join(self.path, item))
|
| 377 |
+
shutil.rmtree(temp_extract)
|
| 378 |
+
|
| 379 |
+
logger.info(f"Downloaded and extracted {self.repo_id} to {self.path}")
|
| 380 |
+
return True
|
| 381 |
+
except Exception as e:
|
| 382 |
+
logger.error(f"HTTP download failed for {self.repo_id}: {e}")
|
| 383 |
return False
|
| 384 |
|
| 385 |
def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
|