Spaces:

luguog
/

overworker

Running

App Files Files Community

luguog committed 29 days ago

Commit

d4e41d9

verified ·

1 Parent(s): 33e587f

Upload github_ingestion.py with huggingface_hub

Browse files

Files changed (1) hide show

github_ingestion.py +149 -0

github_ingestion.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""GitHub repo ingestion - fetches public repo text files using git."""
+import asyncio
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from typing import List, Optional
+@dataclass
+class RepoFile:
+    """One text file read out of a cloned repository."""
+
+    # Path of the file relative to the root of the clone.
+    path: str
+    # Text of the file, decoded as UTF-8 with undecodable bytes dropped.
+    content: str
+    # Length of ``content`` in characters, as recorded by GitHubIngestor.
+    size: int
+@dataclass
+class RepoStructure:
+    """Result of ingesting one repository: its identity plus the files read."""
+
+    # Account (user or organisation) segment parsed from the repository URL.
+    owner: str
+    # Repository name segment parsed from the URL, without a ".git" suffix.
+    repo: str
+    # Every text file that was read from the clone.
+    files: List[RepoFile]
+    # Text of the repository's README, or None when none was found.
+    readme: Optional[str] = None
+class GitHubIngestor:
+    """Fetch a public GitHub repository with ``git clone`` and read its text files.
+
+    The repository is shallow-cloned into a temporary directory, text-like
+    files are read into memory, and the clone is removed again before
+    :meth:`ingest_repo` returns.
+    """
+
+    # Individual files larger than this many bytes are skipped.
+    MAX_FILE_BYTES = 100_000
+
+    # Lower-case suffixes treated as text. Kept as a tuple so the whole set
+    # can be handed to a single ``str.endswith`` call.
+    TEXT_EXTENSIONS = (
+        '.py', '.js', '.ts', '.tsx', '.jsx', '.md', '.rst', '.txt', '.json',
+        '.yaml', '.yml', '.toml', '.cfg', '.ini', '.sh', '.bash', '.zsh',
+        '.rs', '.go', '.java', '.c', '.cpp', '.h', '.hpp', '.css', '.html',
+        '.xml', '.sql', '.rb', '.php',
+    )
+
+    # Upper-case endings of well-known extension-less text files. They are
+    # compared against the upper-cased path, so every entry must itself be
+    # upper-case (a mixed-case 'Makefile' entry could never match).
+    TEXT_FILENAME_SUFFIXES = (
+        'README', 'LICENSE', 'CONTRIBUTING', 'CHANGELOG', 'MAKEFILE',
+    )
+
+    # Owner and repo follow "github.com/" (HTTPS) or "github.com:" (SSH).
+    # An optional ".git" suffix and any trailing path, query string or
+    # fragment (e.g. "/tree/main", "?tab=readme") are tolerated and dropped.
+    _REPO_URL_RE = re.compile(
+        r"github\.com[/:]([A-Za-z0-9_.-]+)/([A-Za-z0-9_.-]+?)(?:\.git)?(?:[/?#].*)?$"
+    )
+
+    def __init__(self):
+        # No API client needed - everything goes through the git CLI.
+        pass
+
+    async def close(self):
+        """Release resources held by the ingestor (currently there are none)."""
+        pass
+
+    def parse_repo_url(self, url: str) -> tuple[str, str]:
+        """Extract ``(owner, repo)`` from a GitHub repository URL.
+
+        Accepts HTTPS and SSH forms, with or without a ``.git`` suffix, a
+        trailing slash, extra path segments, a query string or a fragment.
+
+        Raises:
+            ValueError: if no ``github.com/<owner>/<repo>`` part is found.
+        """
+        match = self._REPO_URL_RE.search(url.strip())
+        if match is None:
+            raise ValueError(f"Invalid GitHub URL: {url}")
+        owner, repo = match.groups()
+        return owner, repo
+
+    def is_text_file(self, path: str) -> bool:
+        """Return True when ``path`` looks like a text file, judged by name only.
+
+        Both checks are case-insensitive: the path must end with one of
+        ``TEXT_EXTENSIONS`` or with one of ``TEXT_FILENAME_SUFFIXES``.
+        """
+        if path.lower().endswith(self.TEXT_EXTENSIONS):
+            return True
+        return path.upper().endswith(self.TEXT_FILENAME_SUFFIXES)
+
+    async def ingest_repo(
+        self,
+        url: str,
+        max_files: int = 1000,
+        max_total_bytes: int = 50_000_000,
+        clone_timeout: float = 60,
+    ) -> "RepoStructure":
+        """Clone the repository behind ``url`` and read its text files.
+
+        Args:
+            url: GitHub repository URL, see :meth:`parse_repo_url`.
+            max_files: Stop reading once this many files were collected.
+            max_total_bytes: Stop reading before the summed on-disk size of
+                the collected files would exceed this budget.
+            clone_timeout: Seconds allowed for ``git clone`` to finish.
+
+        Returns:
+            A ``RepoStructure`` holding the files read and the README text
+            (``None`` when no README was read).
+
+        Raises:
+            ValueError: if ``url`` is not a recognisable GitHub URL.
+            subprocess.CalledProcessError: if ``git clone`` fails.
+            subprocess.TimeoutExpired: if the clone exceeds ``clone_timeout``.
+        """
+        owner, repo = self.parse_repo_url(url)
+        # Cloning and reading are blocking work; run them in a worker thread
+        # so the event loop stays responsive in the meantime.
+        files, readme = await asyncio.to_thread(
+            self._clone_and_read, owner, repo, max_files, max_total_bytes, clone_timeout
+        )
+        return RepoStructure(owner=owner, repo=repo, files=files, readme=readme)
+
+    def _clone_and_read(
+        self,
+        owner: str,
+        repo: str,
+        max_files: int,
+        max_total_bytes: int,
+        clone_timeout: float,
+    ) -> "tuple[List[RepoFile], Optional[str]]":
+        """Shallow-clone into a scratch directory, read it, always clean up (blocking)."""
+        temp_dir = tempfile.mkdtemp()
+        try:
+            repo_url = f"https://github.com/{owner}/{repo}.git"
+            # Depth 1 skips history, which is not needed to read the files.
+            subprocess.run(
+                ["git", "clone", "--depth", "1", repo_url, temp_dir],
+                check=True,
+                capture_output=True,
+                timeout=clone_timeout,
+                # Fail fast instead of waiting on a credential prompt when
+                # the repository is private or does not exist.
+                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
+            )
+            return self._read_text_files(temp_dir, max_files, max_total_bytes)
+        finally:
+            # ignore_errors: a failed cleanup must not mask the real outcome.
+            shutil.rmtree(temp_dir, ignore_errors=True)
+
+    def _read_text_files(
+        self,
+        repo_dir: str,
+        max_files: int,
+        max_total_bytes: int,
+    ) -> "tuple[List[RepoFile], Optional[str]]":
+        """Walk ``repo_dir`` and return ``(files, readme_text)`` within the limits."""
+        files = []
+        readme = None
+        readme_depth = None
+        total_bytes = 0
+        for root, dirs, names in os.walk(repo_dir):
+            # Prune .git in place so os.walk never descends into it; sorting
+            # makes the selection deterministic when a limit cuts the walk.
+            dirs[:] = sorted(d for d in dirs if d != '.git')
+            for name in sorted(names):
+                # A reached limit ends the whole walk, not just this directory.
+                if len(files) >= max_files:
+                    return files, readme
+                full_path = os.path.join(root, name)
+                rel_path = os.path.relpath(full_path, repo_dir)
+                if not self.is_text_file(rel_path):
+                    continue
+                # Never follow symlinks: a hostile repository could point one
+                # at a file outside the clone and have it read back.
+                if os.path.islink(full_path):
+                    continue
+                try:
+                    size = os.path.getsize(full_path)
+                    if size > self.MAX_FILE_BYTES:
+                        continue
+                    if total_bytes + size > max_total_bytes:
+                        return files, readme
+                    # errors='ignore' drops undecodable bytes instead of raising.
+                    with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
+                        content = f.read()
+                except OSError:
+                    # Best effort: an unreadable file is skipped, not fatal.
+                    continue
+                # Charge the budget only for files that were actually read.
+                total_bytes += size
+                files.append(RepoFile(path=rel_path, content=content, size=len(content)))
+                # Only a file literally named README (any extension) counts,
+                # and the one closest to the repository root wins.
+                if os.path.splitext(name)[0].upper() == "README":
+                    depth = rel_path.count(os.sep)
+                    if readme_depth is None or depth < readme_depth:
+                        readme, readme_depth = content, depth
+        return files, readme