overworker / github_ingestion.py
luguog's picture
Upload github_ingestion.py with huggingface_hub
d4e41d9 verified
"""GitHub repo ingestion - fetches public repo text files using git."""
import subprocess
import tempfile
import shutil
import os
from typing import List, Optional
import re
from dataclasses import dataclass
@dataclass
class RepoFile:
    """A single text file read out of a cloned repository."""

    # Path of the file (the ingestor stores it relative to the repo root).
    path: str
    # Decoded text content of the file.
    content: str
    # Size recorded for the file by whoever builds the record.
    size: int
@dataclass
class RepoStructure:
    """The result of ingesting one repository: its identity plus its files."""

    # GitHub account (user or organisation) that owns the repository.
    owner: str
    # Repository name.
    repo: str
    # Text files collected from the repository.
    files: List[RepoFile]
    # Content of the repository README, or None when none was found.
    readme: Optional[str] = None
class GitHubIngestor:
    """Fetches and parses public GitHub repositories using ``git clone``.

    The repository is shallow-cloned into a temporary directory, its text
    files are read into memory, and the temporary directory is removed again
    before the result is returned.
    """

    # Suffixes (lower-case) of files treated as text; matched case-insensitively.
    _TEXT_EXTENSIONS = (
        '.py', '.js', '.ts', '.tsx', '.jsx', '.md', '.txt', '.json', '.yaml', '.yml',
        '.toml', '.cfg', '.ini', '.sh', '.bash', '.zsh', '.rs', '.go', '.java',
        '.c', '.cpp', '.h', '.hpp', '.css', '.html', '.xml', '.sql', '.rb', '.php',
    )
    # Well-known extension-less text files. Stored upper-case because they are
    # compared against the upper-cased path.
    _TEXT_FILENAMES = ('README', 'LICENSE', 'CONTRIBUTING', 'CHANGELOG', 'MAKEFILE')
    # Individual files larger than this many bytes are skipped.
    _MAX_FILE_BYTES = 100000
    # Seconds allowed for the ``git clone`` subprocess.
    _CLONE_TIMEOUT_SECONDS = 60
    # ``github.com`` must start the string or follow '/', '@' or '.', so that
    # look-alike hosts such as ``notgithub.com`` are rejected. The separator
    # after the host is '/' (https) or ':' (scp-style ssh). Owner and repo stop
    # at the next '/', '?' or '#'.
    _REPO_URL_RE = re.compile(
        r"(?:^|[/@.])github\.com[/:]([^/?#]+)/([^/?#]+)",
        re.IGNORECASE,
    )

    def __init__(self):
        # No API client needed - using git clone
        pass

    async def close(self):
        # No resources to clean up
        pass

    def parse_repo_url(self, url: str) -> tuple[str, str]:
        """Extract ``(owner, repo)`` from a GitHub URL.

        Accepts plain repository URLs with or without a ``.git`` suffix or a
        trailing slash, deeper links such as ``.../owner/repo/tree/main``,
        URLs carrying a query string or fragment, and the scp-style SSH form
        ``git@github.com:owner/repo.git``.

        Raises:
            ValueError: if no ``github.com/<owner>/<repo>`` part can be found.
        """
        match = self._REPO_URL_RE.search(url.strip())
        if match:
            owner, repo = match.groups()
            repo = repo.removesuffix(".git")
            if repo:
                return owner, repo
        raise ValueError(f"Invalid GitHub URL: {url}")

    def is_text_file(self, path: str) -> bool:
        """Return True if ``path`` looks like a text file, judged by its name.

        Both the extension check and the well-known-filename check are
        case-insensitive, so ``MAIN.PY`` and ``Makefile`` are recognised.
        """
        if path.lower().endswith(self._TEXT_EXTENSIONS):
            return True
        return path.upper().endswith(self._TEXT_FILENAMES)

    async def ingest_repo(
        self,
        url: str,
        max_files: int = 1000,
        max_total_bytes: int = 50_000_000,
    ) -> "RepoStructure":
        """Main ingestion method - clones the repo and reads its text files.

        Args:
            url: GitHub repository URL (see ``parse_repo_url``).
            max_files: Maximum number of files to collect.
            max_total_bytes: Budget for the summed on-disk size of the
                collected files; collection stops at the first file that
                would exceed it.

        Returns:
            A ``RepoStructure`` with the collected files and the README text.

        Raises:
            ValueError: if ``url`` is not a recognisable GitHub URL.
            subprocess.CalledProcessError: if ``git clone`` fails.
            subprocess.TimeoutExpired: if ``git clone`` exceeds the timeout.
        """
        # Imported here so the block does not depend on a module-level import.
        import asyncio

        owner, repo = self.parse_repo_url(url)
        # Cloning and reading files are blocking operations; run them in a
        # worker thread so the event loop stays responsive.
        return await asyncio.to_thread(
            self._clone_and_collect, owner, repo, max_files, max_total_bytes
        )

    def _clone_and_collect(
        self, owner: str, repo: str, max_files: int, max_total_bytes: int
    ) -> "RepoStructure":
        """Shallow-clone the repository into a temp dir and collect its files."""
        temp_dir = tempfile.mkdtemp()
        try:
            repo_url = f"https://github.com/{owner}/{repo}.git"
            # Clone with depth 1 for speed. GIT_TERMINAL_PROMPT=0 makes git
            # fail instead of prompting for credentials (e.g. for a private or
            # missing repo), which would otherwise block until the timeout.
            subprocess.run(
                ["git", "clone", "--depth", "1", repo_url, temp_dir],
                check=True,
                capture_output=True,
                timeout=self._CLONE_TIMEOUT_SECONDS,
                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
            )
            files, readme_content = self._collect_files(
                temp_dir, max_files, max_total_bytes
            )
            return RepoStructure(
                owner=owner,
                repo=repo,
                files=files,
                readme=readme_content,
            )
        finally:
            # Clean up temp directory
            shutil.rmtree(temp_dir, ignore_errors=True)

    def _collect_files(
        self, root_dir: str, max_files: int, max_total_bytes: int
    ) -> "tuple[list[RepoFile], Optional[str]]":
        """Walk ``root_dir`` and read its text files, honouring both limits.

        Returns ``(files, readme_content)``. The README is the shallowest file
        whose name contains "README"; ``readme_content`` is None if no such
        file was collected.
        """
        files = []
        readme_content = None
        readme_depth = None
        total_bytes = 0
        for current, dirs, names in os.walk(root_dir):
            # Skip .git, and sort so the result is deterministic when a limit
            # cuts the walk short.
            dirs[:] = sorted(d for d in dirs if d != '.git')
            for name in sorted(names):
                # File-count limit reached: stop the whole walk, not just
                # the current directory.
                if len(files) >= max_files:
                    return files, readme_content
                full_path = os.path.join(current, name)
                rel_path = os.path.relpath(full_path, root_dir)
                if not self.is_text_file(rel_path):
                    continue
                # The checkout is untrusted: a symlink could point at any file
                # on the host, so never follow one.
                if os.path.islink(full_path):
                    continue
                try:
                    size = os.path.getsize(full_path)
                    # Skip very large files
                    if size > self._MAX_FILE_BYTES:
                        continue
                    # Byte budget exhausted: stop the whole walk.
                    if total_bytes + size > max_total_bytes:
                        return files, readme_content
                    # errors='ignore' drops undecodable bytes, so no
                    # UnicodeDecodeError can be raised here.
                    with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except OSError:
                    # Best effort: an unreadable file is skipped.
                    continue
                # Count bytes only once the file was actually read.
                total_bytes += size
                # ``size`` is the length of the decoded text (characters).
                files.append(RepoFile(path=rel_path, content=content, size=len(content)))
                # Check the file name only (not its directories) and prefer
                # the README closest to the repository root.
                if "README" in name.upper():
                    depth = rel_path.count(os.sep)
                    if readme_depth is None or depth < readme_depth:
                        readme_content = content
                        readme_depth = depth
        return files, readme_content