Asish Karthikeya Gogineni committed on
Commit
c06be9c
·
1 Parent(s): 82387e1

feat: Add HTTP ZIP download fallback for GitHub repos

Browse files

- GitHubRepoManager now tries git clone first, falls back to HTTP ZIP download
- Downloads the repo as a ZIP archive from github.com over HTTPS if the git clone (via GitPython) fails
- Fixes GitHub repo ingestion on Hugging Face Spaces where git may not work
- Handles both 'main' and 'master' branch naming conventions

Files changed (1) hide show
  1. code_chatbot/universal_ingestor.py +55 -4
code_chatbot/universal_ingestor.py CHANGED
@@ -301,11 +301,12 @@ class GitHubRepoManager(DataManager):
301
  self.path = os.path.join(local_dir, repo_id.replace("/", "_"))
302
 
303
  def download(self) -> bool:
304
- """Clones the GitHub repository."""
305
  if os.path.exists(self.path) and os.listdir(self.path):
306
  logger.info(f"Repo already cloned at {self.path}")
307
  return True
308
 
 
309
  try:
310
  from git import Repo, GitCommandError
311
 
@@ -325,10 +326,60 @@ class GitHubRepoManager(DataManager):
325
  logger.info(f"Cloned {self.repo_id} to {self.path}")
326
  return True
327
  except ImportError:
328
- logger.error("GitPython not installed. Install with: pip install gitpython")
329
- raise
330
  except Exception as e:
331
- logger.error(f"Failed to clone {self.repo_id}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  return False
333
 
334
  def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]:
 
301
  self.path = os.path.join(local_dir, repo_id.replace("/", "_"))
302
 
303
  def download(self) -> bool:
304
+ """Clones the GitHub repository. Falls back to HTTP ZIP download if git fails."""
305
  if os.path.exists(self.path) and os.listdir(self.path):
306
  logger.info(f"Repo already cloned at {self.path}")
307
  return True
308
 
309
+ # Try git clone first
310
  try:
311
  from git import Repo, GitCommandError
312
 
 
326
  logger.info(f"Cloned {self.repo_id} to {self.path}")
327
  return True
328
  except ImportError:
329
+ logger.warning("GitPython not available, falling back to HTTP download")
 
330
  except Exception as e:
331
+ logger.warning(f"Git clone failed: {e}, falling back to HTTP download")
332
+
333
+ # Fallback: Download as ZIP via GitHub API
334
+ try:
335
+ return self._download_as_zip()
336
+ except Exception as e:
337
+ logger.error(f"Failed to download {self.repo_id}: {e}")
338
+ return False
339
+
340
def _download_as_zip(self) -> bool:
    """Download the repository as a ZIP archive over HTTPS and unpack it into ``self.path``.

    Fallback for environments where cloning with git does not work. The
    archive for the ``main`` branch is requested first; if GitHub answers
    404, the ``master`` branch is tried instead.

    The archive is unpacked into a unique temporary directory next to
    ``self.path`` and its contents are then moved into place. The temporary
    directory is always removed, and a target directory that was only
    partially populated is removed on failure so that a later ``download()``
    call does not mistake it for a complete checkout.

    Relies on the module-level ``requests``, ``zipfile``, ``shutil``, ``os``
    and ``logger`` names.

    Returns:
        True if the archive was downloaded and its contents placed in
        ``self.path``; False on any failure (the error is logged).
    """
    import io
    import tempfile

    temp_extract = None
    populating = False  # True once files start landing in self.path

    try:
        response = None
        for branch in ("main", "master"):
            zip_url = f"https://github.com/{self.repo_id}/archive/refs/heads/{branch}.zip"
            logger.info(f"Downloading {self.repo_id} as ZIP from {zip_url}")
            response = requests.get(zip_url, timeout=60)
            if response.status_code != 404:
                # Either success or a non-"missing branch" error; stop probing.
                break

        response.raise_for_status()

        os.makedirs(self.path, exist_ok=True)

        # A unique sibling directory avoids collisions with leftovers from an
        # earlier failed run, and keeps the final move on the same filesystem.
        parent_dir = os.path.dirname(os.path.abspath(self.path))
        temp_extract = tempfile.mkdtemp(prefix="_temp_extract_", dir=parent_dir)

        with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
            zip_ref.extractall(temp_extract)

        # GitHub ZIPs have a single top-level folder like "repo-main"; if that
        # is what we got, its contents (not the folder itself) are moved up.
        extracted_items = os.listdir(temp_extract)
        source_dir = temp_extract
        if len(extracted_items) == 1:
            only_entry = os.path.join(temp_extract, extracted_items[0])
            if os.path.isdir(only_entry):
                source_dir = only_entry

        populating = True
        for item in os.listdir(source_dir):
            destination = os.path.join(self.path, item)
            # shutil.move() nests the source *inside* an existing destination
            # directory, so clear any leftover entry first.
            if os.path.lexists(destination):
                if os.path.isdir(destination) and not os.path.islink(destination):
                    shutil.rmtree(destination)
                else:
                    os.remove(destination)
            shutil.move(os.path.join(source_dir, item), destination)

        logger.info(f"Downloaded and extracted {self.repo_id} to {self.path}")
        return True
    except Exception as e:
        logger.error(f"HTTP download failed for {self.repo_id}: {e}")
        if populating:
            # Do not leave a half-filled directory that looks like a finished clone.
            shutil.rmtree(self.path, ignore_errors=True)
        return False
    finally:
        if temp_extract is not None:
            shutil.rmtree(temp_extract, ignore_errors=True)
384
 
385
  def walk(self, get_content: bool = True) -> Generator[Tuple[Any, Dict], None, None]: