luguog committed on
Commit
d4e41d9
·
verified ·
1 Parent(s): 33e587f

Upload github_ingestion.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. github_ingestion.py +149 -0
github_ingestion.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """GitHub repo ingestion - fetches public repo text files using git."""
2
+ import subprocess
3
+ import tempfile
4
+ import shutil
5
+ import os
6
+ from typing import List, Optional
7
+ import re
8
+ from dataclasses import dataclass
9
+
10
+
11
@dataclass
class RepoFile:
    """A single text file read from a cloned repository."""

    # Path of the file relative to the repository root.
    path: str
    # Decoded text content of the file.
    content: str
    # Size recorded by the producer of this object.
    size: int
16
+
17
+
18
@dataclass
class RepoStructure:
    """The result of ingesting one repository: its identity and its files."""

    # Account (user or organisation) that owns the repository.
    owner: str
    # Repository name.
    repo: str
    # Files collected from the repository.
    files: List[RepoFile]
    # Content of the repository README, or None when none was found.
    readme: Optional[str] = None
24
+
25
+
26
class GitHubIngestor:
    """Fetches and parses GitHub repositories using git.

    A repository is shallow-cloned into a temporary directory, its text
    files are read into memory, and the clone is removed again.
    """

    # File extensions (lower-case, including the dot) treated as text.
    _TEXT_EXTENSIONS = frozenset({
        '.py', '.js', '.ts', '.tsx', '.jsx', '.md', '.txt', '.json', '.yaml', '.yml',
        '.toml', '.cfg', '.ini', '.sh', '.bash', '.zsh', '.rs', '.go', '.java',
        '.c', '.cpp', '.h', '.hpp', '.css', '.html', '.xml', '.sql', '.rb', '.php',
    })

    # Well-known extension-less text files; compared against the upper-cased
    # basename, so every entry here must itself be upper-case.
    _TEXT_FILENAMES = frozenset({
        'README', 'LICENSE', 'CONTRIBUTING', 'CHANGELOG', 'MAKEFILE',
    })

    # Accepts https/http/ssh/scheme-less and scp-style ("git@github.com:o/r")
    # URLs, an optional ".git" suffix, trailing path segments such as
    # "/tree/main", and a query string or fragment.
    _REPO_URL_RE = re.compile(
        r"(?:^|[/@])(?:www\.)?github\.com[/:]"
        r"(?P<owner>[A-Za-z0-9][A-Za-z0-9_.-]*)/"
        r"(?P<repo>[A-Za-z0-9_.-]+?)(?:\.git)?"
        r"(?:/[^?#]*)?(?:[?#].*)?$"
    )

    def __init__(self):
        # No API client needed - using git clone
        pass

    async def close(self):
        # No resources to clean up
        pass

    def parse_repo_url(self, url: str) -> tuple[str, str]:
        """Extract owner and repo from a GitHub URL.

        Args:
            url: A GitHub repository URL, e.g.
                ``https://github.com/owner/repo``,
                ``https://github.com/owner/repo.git``,
                ``https://github.com/owner/repo/tree/main`` or
                ``git@github.com:owner/repo.git``.

        Returns:
            ``(owner, repo)`` with any ``.git`` suffix removed.

        Raises:
            ValueError: If ``url`` does not point at a GitHub repository.
        """
        match = self._REPO_URL_RE.search(url.strip())
        # "." and ".." satisfy the character class but are not repo names.
        if match is None or match.group("repo") in {".", ".."}:
            raise ValueError(f"Invalid GitHub URL: {url}")
        return match.group("owner"), match.group("repo")

    def is_text_file(self, path: str) -> bool:
        """Check if a file is likely text-based, judging by its name only.

        Args:
            path: File path; only its final component is inspected.

        Returns:
            True when the extension (case-insensitive) is a known text
            extension, or the file name (case-insensitive) is one of the
            well-known extension-less text files such as ``README`` or
            ``Makefile``.
        """
        name = os.path.basename(path)
        extension = os.path.splitext(name)[1].lower()
        if extension in self._TEXT_EXTENSIONS:
            return True
        return name.upper() in self._TEXT_FILENAMES

    async def ingest_repo(
        self,
        url: str,
        max_files: int = 1000,
        max_total_bytes: int = 50_000_000,
        max_file_bytes: int = 100_000,
    ) -> RepoStructure:
        """Main ingestion method - clones repo and reads all text files.

        The clone and the file reads are blocking, so they run in a worker
        thread to keep the event loop responsive.

        Args:
            url: GitHub repository URL (see ``parse_repo_url``).
            max_files: Stop collecting once this many files were read.
            max_total_bytes: Stop collecting once the next file would push
                the cumulative on-disk size above this budget.
            max_file_bytes: Individual files larger than this are skipped.

        Returns:
            A ``RepoStructure`` holding the collected files and the content
            of the README closest to the repository root (if any).

        Raises:
            ValueError: If ``url`` is not a valid GitHub repository URL.
            subprocess.CalledProcessError: If ``git clone`` fails.
            subprocess.TimeoutExpired: If ``git clone`` exceeds 60 seconds.
        """
        # Imported locally so this method does not depend on a module-level
        # import being present.
        import asyncio

        owner, repo = self.parse_repo_url(url)
        files, readme_content = await asyncio.to_thread(
            self._clone_and_read,
            owner,
            repo,
            max_files,
            max_total_bytes,
            max_file_bytes,
        )
        return RepoStructure(
            owner=owner,
            repo=repo,
            files=files,
            readme=readme_content,
        )

    def _clone_and_read(
        self,
        owner: str,
        repo: str,
        max_files: int,
        max_total_bytes: int,
        max_file_bytes: int,
    ) -> tuple[List[RepoFile], Optional[str]]:
        """Shallow-clone the repo into a temp dir, read it, always clean up."""
        temp_dir = tempfile.mkdtemp()
        try:
            repo_url = f"https://github.com/{owner}/{repo}.git"

            # Clone with depth 1 for speed. GIT_TERMINAL_PROMPT=0 makes git
            # fail instead of asking for credentials on private or missing
            # repositories; "--" ends option parsing before the URL.
            subprocess.run(
                ["git", "clone", "--depth", "1", "--", repo_url, temp_dir],
                check=True,
                capture_output=True,
                timeout=60,
                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
            )

            return self._read_text_files(
                temp_dir, max_files, max_total_bytes, max_file_bytes
            )
        finally:
            # Clean up temp directory
            shutil.rmtree(temp_dir, ignore_errors=True)

    def _read_text_files(
        self,
        root_dir: str,
        max_files: int,
        max_total_bytes: int,
        max_file_bytes: int,
    ) -> tuple[List[RepoFile], Optional[str]]:
        """Walk ``root_dir`` and collect text files plus the best README."""
        files: List[RepoFile] = []
        readme_content: Optional[str] = None
        readme_depth: Optional[int] = None
        total_bytes = 0

        for current_dir, dirs, names in os.walk(root_dir):
            # Skip .git directory; sort so traversal (and therefore which
            # files survive the limits) is deterministic.
            dirs[:] = sorted(d for d in dirs if d != '.git')

            for name in sorted(names):
                # File count limit reached: stop the whole walk.
                if len(files) >= max_files:
                    return files, readme_content

                full_path = os.path.join(current_dir, name)
                rel_path = os.path.relpath(full_path, root_dir)

                # Repository content is untrusted: a symlink may point
                # outside the clone, so never follow one.
                if os.path.islink(full_path):
                    continue
                if not self.is_text_file(rel_path):
                    continue

                try:
                    size = os.path.getsize(full_path)
                    if size > max_file_bytes:
                        continue

                    # Total byte budget exhausted: stop the whole walk.
                    if total_bytes + size > max_total_bytes:
                        return files, readme_content

                    # errors='ignore' drops undecodable bytes, so decoding
                    # itself cannot fail here.
                    with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                except OSError:
                    # Best effort: unreadable files are skipped.
                    continue

                # Only count files that were actually read.
                total_bytes += size

                # NOTE: size is the decoded character count, not the
                # on-disk byte size (unchanged from the previous behavior).
                files.append(
                    RepoFile(path=rel_path, content=content, size=len(content))
                )

                # Keep the README closest to the repository root; on equal
                # depth the first one encountered wins.
                if name.upper().startswith("README"):
                    depth = rel_path.count(os.sep)
                    if readme_depth is None or depth < readme_depth:
                        readme_content = content
                        readme_depth = depth

        return files, readme_content