nihalaninihal committed on
Commit
fa708f6
·
verified ·
1 Parent(s): ec05dfb

Delete repository_selector.py

Browse files
Files changed (1) hide show
  1. repository_selector.py +0 -280
repository_selector.py DELETED
@@ -1,280 +0,0 @@
1
- from typing import Dict, List, Any
2
- from pathlib import Path
3
- from collections import defaultdict
4
- import subprocess
5
- from datetime import datetime
6
- from analyze_repository_structure import RELEVANT_EXTENSIONS
7
-
8
class RepositorySelector:
    """Handles intelligent repository selection and authorship analysis.

    Given a base directory of clones (laid out as
    ``<base>/<username>/<username>_<repo>.git`` — see ``select_repositories``)
    and a pre-computed report of commits/contributors, this class selects
    which repositories deserve deeper analysis: the top-scored active
    repositories plus every repository where the user is the sole
    contributor.
    """

    def __init__(self, base_path: str, username: str):
        """
        Args:
            base_path: Root directory containing per-user clone folders.
            username: Name of the user whose contributions are analyzed.
        """
        self.base_path = Path(base_path)
        self.username = username
        # Per-user repos live under <base_path>/<username>/
        self.user_path = self.base_path / username

    def select_repositories(self, report_data: Dict) -> List[str]:
        """
        Main entry point for repository selection.

        Returns a list of repository names to analyze, including both
        best-scored repos and single-contributor repos. As a side effect,
        populates ``self.report_data`` and ``self.repo_metadata`` (per-repo
        stats and contribution files) for later use.
        """
        # Store report data for use in other methods
        self.report_data = report_data

        # Get repositories with activity scores
        repositories = self._analyze_repositories(report_data)
        print(f"Found {len(repositories)} repositories with activity")

        # Get best scored repositories
        selected_repos = self._select_best_repositories(repositories)
        selected_repo_names = {repo["name"] for repo in selected_repos}

        # Get single-contributor repositories
        single_contributor_repos = self._get_only_owner_sources()

        # Combine both sets of repositories without duplicates
        all_repo_names = selected_repo_names.union(single_contributor_repos)

        print(f"Added {len(all_repo_names) - len(selected_repo_names)} single-contributor repositories")
        print(f"Total repositories to analyze: {len(all_repo_names)}")

        # Update metadata for all repositories
        self.repo_metadata = {}
        for repo in selected_repos:
            self.repo_metadata[repo["name"]] = {
                "contribution_files": repo["contribution_files"],
                "stats": repo["stats"]
            }

        # Add metadata for additional single-contributor repos if they
        # weren't in selected_repos
        for repo_name in single_contributor_repos:
            if repo_name not in self.repo_metadata:
                repo_path = self.user_path / f"{self.username}_{repo_name}.git"
                if repo_path.exists():
                    stats = self._get_repository_stats(
                        repo_path, report_data.get("commits", {}).get(repo_name, [])
                    )
                    contribution_files = self._analyze_contribution_files(repo_path)
                    self.repo_metadata[repo_name] = {
                        "contribution_files": contribution_files,
                        "stats": stats or {}
                    }

        return list(all_repo_names)

    def _get_only_owner_sources(self) -> List[str]:
        """Gets list of repositories to analyze. Only single-contributor repos are considered."""
        # Comparing against the whole expected list avoids the IndexError the
        # previous version raised on a repo whose "contributors" list is empty
        # (it indexed [0] before checking the length).
        return [
            obj["repo"]
            for obj in self.report_data.get("contributors", [])
            if obj.get("contributors") == [self.username]
        ]

    def _analyze_repositories(self, report_data: Dict) -> List[Dict[str, Any]]:
        """Analyzes all repositories the user has contributed to.

        Merges repo names from both the "contributors" and "commits"
        sections of the report, then gathers stats/contribution files for
        every clone that exists on disk.
        """
        repositories = []

        # Get repos from contributors data
        contributed_repos = [
            obj["repo"] for obj in report_data.get("contributors", [])
            if self.username in obj["contributors"]
        ]

        # Also get repos from commits data
        commit_repos = list(report_data.get("commits", {}).keys())

        # Combine and deduplicate
        all_repos = list(set(contributed_repos + commit_repos))

        print(f"Analyzing {len(all_repos)} repositories...")

        for repo_name in all_repos:
            repo_path = self.user_path / f"{self.username}_{repo_name}.git"
            if not repo_path.exists():
                continue

            repo_stats = self._get_repository_stats(
                repo_path, report_data.get("commits", {}).get(repo_name, [])
            )
            if not repo_stats:
                continue

            contribution_files = self._analyze_contribution_files(repo_path)

            # Include repository if it has either commits or contribution files
            if repo_stats["commit_count"] > 0 or contribution_files:
                repositories.append({
                    "name": repo_name,
                    "stats": repo_stats,
                    "contribution_files": contribution_files
                })

        return repositories

    def _analyze_contribution_files(self, repo_path: Path) -> List[Dict[str, Any]]:
        """Identifies files with user contributions, with more flexible criteria.

        Returns a list of {"path", "contribution_percentage"} dicts for every
        analyzable file where the user authored at least 20% of the lines.
        """
        contribution_files = []

        # List all files in repository
        for file_path in repo_path.rglob('*'):
            # rglob also yields directories; a directory named e.g. "foo.py"
            # would otherwise slip past the extension filter below.
            if not file_path.is_file():
                continue

            relative_path = str(file_path.relative_to(repo_path))

            # Skip excluded paths and non-source files
            if not self._is_analyzable_file(relative_path):
                continue

            try:
                # Get authorship statistics
                author_stats = self._get_file_author_stats(repo_path, relative_path)

                # Include files where user has any meaningful contribution (>=20%)
                if self.username in author_stats and author_stats[self.username] >= 20:
                    contribution_files.append({
                        "path": relative_path,
                        "contribution_percentage": author_stats[self.username]
                    })

            except Exception as e:
                # Best-effort: a single unreadable file must not abort the scan.
                print(f"Error analyzing {relative_path}: {str(e)}")
                continue

        return contribution_files

    def _get_repository_stats(self, repo_path: Path, repo_commits: List = None) -> Dict[str, Any]:
        """Analyzes repository activity metrics with both git log and commits data.

        Returns a dict with first/last commit ISO timestamps, commit count,
        commits per day, and active-day span — or {} on any failure.
        """
        try:
            # Get commit timestamps from git log. Argument-list form (no
            # shell) matches _get_file_author_stats and avoids shell quoting
            # issues; the previous version used shell=True with a string.
            result = subprocess.run(
                ['git', 'log', '--format=%at'],
                cwd=repo_path,
                capture_output=True,
                text=True
            )

            if result.returncode != 0:
                return {}

            timestamps = [int(ts) for ts in result.stdout.strip().split('\n') if ts]

            # Also consider commits from report data
            if repo_commits:
                for commit in repo_commits:
                    commit_date = datetime.fromisoformat(
                        commit["commit"]["author"]["date"].replace("Z", "+00:00")
                    )
                    timestamps.append(int(commit_date.timestamp()))

            if not timestamps:
                return {}

            first_commit = datetime.fromtimestamp(min(timestamps))
            last_commit = datetime.fromtimestamp(max(timestamps))
            commit_count = len(timestamps)
            time_period = (last_commit - first_commit).days + 1

            return {
                "first_commit": first_commit.isoformat(),
                "last_commit": last_commit.isoformat(),
                "commit_count": commit_count,
                "commits_per_day": commit_count / max(time_period, 1),
                "active_days": time_period
            }

        except Exception as e:
            print(f"Error analyzing repository stats: {str(e)}")
            return {}

    def _get_file_author_stats(self, repo_path: Path, file_path: str) -> Dict[str, float]:
        """Analyzes file authorship percentages.

        Runs ``git blame --porcelain`` and returns {author: percent_of_lines};
        {} on failure or for an empty file.
        """
        try:
            result = subprocess.run(
                ['git', 'blame', '--porcelain', file_path],
                cwd=repo_path,
                capture_output=True,
                text=True
            )

            if result.returncode != 0:
                return {}

            author_lines = defaultdict(int)
            total_lines = 0

            # Porcelain output emits one "author <name>" header per line group.
            for line in result.stdout.split('\n'):
                if line.startswith('author '):
                    author = line.replace('author ', '', 1)
                    author_lines[author] += 1
                    total_lines += 1

            if total_lines == 0:
                return {}

            return {
                author: (count / total_lines * 100)
                for author, count in author_lines.items()
            }

        except Exception as e:
            print(f"Error getting authorship stats for {file_path}: {str(e)}")
            return {}

    def _select_best_repositories(self, repositories: List[Dict[str, Any]],
                                  max_repos: int = 15) -> List[Dict[str, Any]]:
        """Selects optimal repositories using more balanced scoring.

        Scores each repo out of ~100 points (recency 35, activity 35,
        contributions 30), annotates it with "analysis_score", and returns
        the top ``max_repos`` entries sorted by score, descending.
        """
        if not repositories:
            return []

        for repo in repositories:
            score = 0
            stats = repo["stats"]

            # Recency score (max 35 points): decays ~1 point per 30 days.
            last_commit = datetime.fromisoformat(stats["last_commit"])
            days_since_last_commit = (datetime.now() - last_commit).days
            score += max(0, 35 - (days_since_last_commit / 30))

            # Activity score (max 35 points)
            commit_score = min(35, (stats["commit_count"] * 2) + (stats["commits_per_day"] * 10))
            score += commit_score

            # Contribution score (max 30 points)
            # Consider both number and quality of contributions
            contribution_files = repo["contribution_files"]
            if contribution_files:
                file_count = len(contribution_files)
                avg_contribution = sum(f["contribution_percentage"] for f in contribution_files) / file_count
                score += min(30, (file_count * 2) + (avg_contribution / 5))
            else:
                # Still give some points for commits if no files detected
                score += min(15, stats["commit_count"] / 2)

            repo["analysis_score"] = score

        # Sort by score and return top repositories
        repositories.sort(key=lambda x: x["analysis_score"], reverse=True)
        selected = repositories[:max_repos]

        print(f"\nSelected {len(selected)} repositories:")
        for repo in selected:
            print(f"- {repo['name']} (score: {repo['analysis_score']:.2f})")

        return selected

    def _is_analyzable_file(self, file_path: str) -> bool:
        """Determines if a file should be included in analysis.

        Excludes vendored/generated directories and any file whose extension
        is not in RELEVANT_EXTENSIONS (or that has no extension at all).
        """
        path = Path(file_path)

        # Skip excluded directories
        excluded_paths = {
            'node_modules', '__pycache__', 'build', 'dist', '.git',
            'vendor', 'third_party', 'external'
        }

        if any(part in excluded_paths for part in path.parts):
            return False

        # Get file extension (lowercase)
        ext = path.suffix.lower()
        if not ext:
            return False

        return ext in RELEVANT_EXTENSIONS