Spaces:

luguog
/

overworker

Running

App Files Files Community

overworker / github_ingestion.py

luguog

Upload github_ingestion.py with huggingface_hub

d4e41d9 verified 21 days ago

raw

history blame contribute delete

5.09 kB

	"""GitHub repo ingestion - fetches public repo text files using git."""
	import subprocess
	import tempfile
	import shutil
	import os
	from typing import List, Optional
	import re
	from dataclasses import dataclass


	@dataclass
	class RepoFile:
	    """A single text file read from a repository checkout.

	    Attributes:
	        path: Location of the file, stored as a string.
	        content: Text content of the file.
	        size: Integer size recorded for the file by whoever builds the record.
	    """

	    path: str
	    content: str
	    size: int


	@dataclass
	class RepoStructure:
	    """The result of ingesting one repository.

	    Attributes:
	        owner: Account or organisation part of the repository name.
	        repo: Repository name.
	        files: The ``RepoFile`` records collected for the repository.
	        readme: README text when one was found, otherwise ``None``.
	    """

	    owner: str
	    repo: str
	    files: List[RepoFile]
	    readme: Optional[str] = None


	class GitHubIngestor:
	    """Fetches and parses GitHub repositories using git.

	    A repository is shallow-cloned into a temporary directory, its text
	    files are read into ``RepoFile`` records, and the directory is removed
	    again before the result is returned.
	    """

	    # Files larger than this many bytes are skipped.
	    _MAX_FILE_BYTES = 100_000

	    # Suffixes treated as text (tuple so it can go straight to str.endswith).
	    _TEXT_EXTENSIONS = (
	        '.py', '.js', '.ts', '.tsx', '.jsx', '.md', '.txt', '.json', '.yaml', '.yml',
	        '.toml', '.cfg', '.ini', '.sh', '.bash', '.zsh', '.rs', '.go', '.java',
	        '.c', '.cpp', '.h', '.hpp', '.css', '.html', '.xml', '.sql', '.rb', '.php',
	    )

	    # Extension-less text files. Kept upper-case because they are compared
	    # against the upper-cased path; a mixed-case entry could never match.
	    _TEXT_FILENAMES = ('README', 'LICENSE', 'CONTRIBUTING', 'CHANGELOG', 'MAKEFILE')

	    # owner/repo with an optional ".git" suffix, trailing slash, query string
	    # or fragment. The look-behind rejects hosts that merely end in
	    # "github.com" (e.g. "notgithub.com").
	    _REPO_URL_RE = re.compile(
	        r"(?<![\w-])github\.com/([^/\s?#]+)/([^/\s?#]+?)(?:\.git)?/?(?:[?#].*)?$"
	    )

	    def __init__(self):
	        # No API client needed - using git clone
	        pass

	    async def close(self):
	        """Release resources held by the ingestor (currently there are none)."""
	        pass

	    def parse_repo_url(self, url: str) -> tuple[str, str]:
	        """Extract owner and repo from a GitHub URL.

	        Accepts ``github.com/owner/repo`` with or without a scheme, a
	        ``.git`` suffix, a trailing slash, a query string or a fragment.
	        Surrounding whitespace is ignored.

	        Args:
	            url: The repository URL.

	        Returns:
	            An ``(owner, repo)`` tuple.

	        Raises:
	            ValueError: If ``url`` is not a recognised GitHub repository URL.
	        """
	        match = self._REPO_URL_RE.search(url.strip())
	        if match is None:
	            raise ValueError(f"Invalid GitHub URL: {url}")

	        owner, repo = match.groups()
	        return owner, repo

	    def is_text_file(self, path: str) -> bool:
	        """Check if a file is likely text-based, judging by its name only.

	        Args:
	            path: File path or name.

	        Returns:
	            True when ``path`` ends with a known text extension
	            (case-sensitive) or with a well-known text filename such as
	            README or Makefile (case-insensitive).
	        """
	        if path.endswith(self._TEXT_EXTENSIONS):
	            return True

	        return path.upper().endswith(self._TEXT_FILENAMES)

	    async def ingest_repo(
	        self,
	        url: str,
	        max_files: int = 1000,
	        max_total_bytes: int = 50_000_000,
	    ) -> "RepoStructure":
	        """Main ingestion method - clones the repo and reads its text files.

	        NOTE: the clone and the file reads are synchronous calls, so this
	        coroutine does not yield to the event loop while they run.

	        Args:
	            url: GitHub repository URL (see ``parse_repo_url``).
	            max_files: Maximum number of files to collect.
	            max_total_bytes: Budget for the summed on-disk size of the
	                collected files.

	        Returns:
	            A ``RepoStructure`` with the collected files and the README text
	            (``None`` when no README was read).

	        Raises:
	            ValueError: If ``url`` cannot be parsed.
	            subprocess.CalledProcessError: If ``git clone`` exits non-zero.
	            subprocess.TimeoutExpired: If the clone takes longer than 60 s.
	        """
	        owner, repo = self.parse_repo_url(url)

	        # Clone repo to temp directory
	        temp_dir = tempfile.mkdtemp()
	        try:
	            repo_url = f"https://github.com/{owner}/{repo}.git"

	            # Clone with depth 1 for speed. GIT_TERMINAL_PROMPT=0 stops git
	            # from asking for credentials on the terminal (private or
	            # missing repository), so it fails instead of waiting for input.
	            subprocess.run(
	                ["git", "clone", "--depth", "1", repo_url, temp_dir],
	                check=True,
	                capture_output=True,
	                timeout=60,
	                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
	            )

	            files, readme_content = self._read_text_files(
	                temp_dir, max_files, max_total_bytes
	            )

	            return RepoStructure(
	                owner=owner,
	                repo=repo,
	                files=files,
	                readme=readme_content
	            )

	        finally:
	            # Clean up temp directory
	            shutil.rmtree(temp_dir, ignore_errors=True)

	    def _read_text_files(
	        self,
	        root_dir: str,
	        max_files: int,
	        max_total_bytes: int,
	    ) -> "tuple[List[RepoFile], Optional[str]]":
	        """Walk ``root_dir`` and read its text files, honouring both limits.

	        Returns ``(files, readme)``. The walk stops entirely as soon as
	        ``max_files`` is reached or the next file would exceed
	        ``max_total_bytes``.
	        """
	        files = []
	        readme_content = None
	        readme_depth = None
	        total_bytes = 0

	        for root, dirs, dir_files in os.walk(root_dir):
	            # Skip .git directory
	            dirs[:] = [d for d in dirs if d != '.git']

	            for file_name in dir_files:
	                # File count limit reached: nothing more can be added.
	                if len(files) >= max_files:
	                    return files, readme_content

	                full_path = os.path.join(root, file_name)
	                rel_path = os.path.relpath(full_path, root_dir)

	                if not self.is_text_file(rel_path):
	                    continue

	                # The checkout is untrusted: a symlink could point at a file
	                # outside the clone, so never read through one.
	                if os.path.islink(full_path):
	                    continue

	                try:
	                    size = os.path.getsize(full_path)
	                    # Skip very large files
	                    if size > self._MAX_FILE_BYTES:
	                        continue

	                    # Byte budget exhausted: stop the whole walk.
	                    if total_bytes + size > max_total_bytes:
	                        return files, readme_content

	                    with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
	                        content = f.read()
	                except (OSError, UnicodeDecodeError):
	                    # Best effort: an unreadable file is skipped.
	                    continue

	                # Only charge the budget once the file was actually read.
	                total_bytes += size
	                files.append(
	                    RepoFile(path=rel_path, content=content, size=len(content))
	                )

	                # Keep the README closest to the repository root; the first
	                # one seen wins between files at the same depth.
	                if file_name.upper().startswith("README"):
	                    depth = rel_path.count(os.sep)
	                    if readme_depth is None or depth < readme_depth:
	                        readme_content = content
	                        readme_depth = depth

	        return files, readme_content