Spaces:
Sleeping
Sleeping
Commit ·
a8b3a89
1
Parent(s): c7355e0
Fix missing await for async AI service calls
Browse files- services/github_service.py +48 -0
- services/rag_data_prep.py +92 -19
services/github_service.py
CHANGED
|
@@ -438,6 +438,54 @@ class GitHubService:
|
|
| 438 |
except Exception as e:
|
| 439 |
logger.error(f"README fetch error for {repo_full_name}: {e}")
|
| 440 |
return ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 441 |
|
| 442 |
|
| 443 |
# Singleton instance
|
|
|
|
| 438 |
except Exception as e:
|
| 439 |
logger.error(f"README fetch error for {repo_full_name}: {e}")
|
| 440 |
return ""
|
| 441 |
+
|
| 442 |
+
async def fetch_contributing_file(self, repo_full_name: str, github_access_token: Optional[str] = None) -> str:
    """
    Fetch the contributing guide for a repository.

    Probes these paths in order and returns the body of the first one
    that answers with HTTP 200:
    CONTRIBUTING.md, .github/CONTRIBUTING.md, docs/CONTRIBUTING.md,
    contributing.md

    Args:
        repo_full_name: Repository identifier interpolated into the
            ``/repos/{repo_full_name}/contents/...`` URL
            (presumably "owner/name" - confirm against callers).
        github_access_token: Optional token; when given it is sent as a
            ``Bearer`` Authorization header.

    Returns:
        The response text of the first path that returned 200, or an
        empty string when no path matched or any exception was raised.
    """
    paths_to_try = [
        "CONTRIBUTING.md",
        ".github/CONTRIBUTING.md",
        "docs/CONTRIBUTING.md",
        "contributing.md",
    ]

    try:
        async with httpx.AsyncClient() as client:
            # Raw media type: the response body is the file content itself,
            # so response.text can be returned without further decoding.
            headers = {"Accept": "application/vnd.github.raw+json"}
            if github_access_token:
                headers["Authorization"] = f"Bearer {github_access_token}"

            for path in paths_to_try:
                url = f"{self.base_url}/repos/{repo_full_name}/contents/{path}"
                response = await client.get(url, headers=headers, timeout=30.0)

                # Any non-200 status just moves on to the next candidate.
                if response.status_code == 200:
                    return response.text

            return ""

    except Exception as e:
        # Best-effort fetch: a failure is logged and reported to the caller
        # as "no contributing file" rather than propagated.
        logger.error(f"CONTRIBUTING fetch error for {repo_full_name}: {e}")
        return ""
|
| 472 |
+
|
| 473 |
+
async def fetch_repository_docs(
    self,
    repo_full_name: str,
    github_access_token: Optional[str] = None
) -> Dict[str, str]:
    """
    Fetch README and CONTRIBUTING files for RAG indexing.

    The two documents are fetched one after the other by delegating to
    ``fetch_repository_readme`` and ``fetch_contributing_file``.

    Args:
        repo_full_name: Repository identifier, forwarded unchanged to
            both fetchers.
        github_access_token: Optional token, forwarded unchanged to
            both fetchers.

    Returns:
        Dict with 'readme' and 'contributing' keys holding whatever the
        respective fetcher returned.
    """
    readme = await self.fetch_repository_readme(repo_full_name, github_access_token)
    contributing = await self.fetch_contributing_file(repo_full_name, github_access_token)

    return {
        "readme": readme,
        "contributing": contributing
    }
|
| 489 |
|
| 490 |
|
| 491 |
# Singleton instance
|
services/rag_data_prep.py
CHANGED
|
@@ -93,6 +93,53 @@ class RAGDataPrep:
|
|
| 93 |
|
| 94 |
return text.strip()
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
def _simple_tokenize(self, text: str) -> List[str]:
|
| 97 |
"""
|
| 98 |
Simple word-based tokenization.
|
|
@@ -225,7 +272,7 @@ class RAGDataPrep:
|
|
| 225 |
Fetch documents from MongoDB for RAG preparation.
|
| 226 |
|
| 227 |
Args:
|
| 228 |
-
doc_types: Types to fetch (issue, pr, comment)
|
| 229 |
repo_names: Optional filter by repository
|
| 230 |
|
| 231 |
Returns:
|
|
@@ -233,29 +280,55 @@ class RAGDataPrep:
|
|
| 233 |
"""
|
| 234 |
from config.database import db
|
| 235 |
|
| 236 |
-
doc_types = doc_types or ["issue", "pr", "comment", "readme"]
|
| 237 |
documents = []
|
| 238 |
|
| 239 |
-
|
|
|
|
| 240 |
from services.github_service import github_service
|
| 241 |
|
| 242 |
for repo in repo_names:
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
if "issue" in doc_types or "pr" in doc_types:
|
| 261 |
query = {}
|
|
|
|
| 93 |
|
| 94 |
return text.strip()
|
| 95 |
|
| 96 |
+
def _detect_priority_sections(self, content: str, doc_type: str) -> str:
|
| 97 |
+
"""
|
| 98 |
+
Detect if content contains high-priority sections for contributor context.
|
| 99 |
+
|
| 100 |
+
Args:
|
| 101 |
+
content: Document content
|
| 102 |
+
doc_type: Type of document (readme, contributing)
|
| 103 |
+
|
| 104 |
+
Returns:
|
| 105 |
+
Priority level: 'high', 'medium', or 'normal'
|
| 106 |
+
"""
|
| 107 |
+
if doc_type == "contributing":
|
| 108 |
+
return "high" # CONTRIBUTING.md is always high priority
|
| 109 |
+
|
| 110 |
+
content_lower = content.lower()
|
| 111 |
+
high_priority_patterns = [
|
| 112 |
+
"getting started",
|
| 113 |
+
"project setup",
|
| 114 |
+
"installation",
|
| 115 |
+
"how to contribute",
|
| 116 |
+
"contributor guidelines",
|
| 117 |
+
"development setup",
|
| 118 |
+
"quick start",
|
| 119 |
+
"for contributors",
|
| 120 |
+
"contributing",
|
| 121 |
+
]
|
| 122 |
+
|
| 123 |
+
medium_priority_patterns = [
|
| 124 |
+
"requirements",
|
| 125 |
+
"dependencies",
|
| 126 |
+
"building",
|
| 127 |
+
"testing",
|
| 128 |
+
"documentation",
|
| 129 |
+
]
|
| 130 |
+
|
| 131 |
+
# Check for high-priority sections
|
| 132 |
+
high_count = sum(1 for pattern in high_priority_patterns if pattern in content_lower)
|
| 133 |
+
if high_count >= 2:
|
| 134 |
+
return "high"
|
| 135 |
+
|
| 136 |
+
# Check for medium-priority sections
|
| 137 |
+
medium_count = sum(1 for pattern in medium_priority_patterns if pattern in content_lower)
|
| 138 |
+
if high_count >= 1 or medium_count >= 2:
|
| 139 |
+
return "medium"
|
| 140 |
+
|
| 141 |
+
return "normal"
|
| 142 |
+
|
| 143 |
def _simple_tokenize(self, text: str) -> List[str]:
|
| 144 |
"""
|
| 145 |
Simple word-based tokenization.
|
|
|
|
| 272 |
Fetch documents from MongoDB for RAG preparation.
|
| 273 |
|
| 274 |
Args:
|
| 275 |
+
doc_types: Types to fetch (issue, pr, comment, readme, contributing)
|
| 276 |
repo_names: Optional filter by repository
|
| 277 |
|
| 278 |
Returns:
|
|
|
|
| 280 |
"""
|
| 281 |
from config.database import db
|
| 282 |
|
| 283 |
+
doc_types = doc_types or ["issue", "pr", "comment", "readme", "contributing"]
|
| 284 |
documents = []
|
| 285 |
|
| 286 |
+
# Fetch README and CONTRIBUTING files with high priority
|
| 287 |
+
if repo_names:
|
| 288 |
from services.github_service import github_service
|
| 289 |
|
| 290 |
for repo in repo_names:
|
| 291 |
+
# Fetch README
|
| 292 |
+
if "readme" in doc_types:
|
| 293 |
+
try:
|
| 294 |
+
content = await github_service.fetch_repository_readme(repo, github_access_token)
|
| 295 |
+
if content:
|
| 296 |
+
# Extract key sections for high priority tagging
|
| 297 |
+
priority = self._detect_priority_sections(content, "readme")
|
| 298 |
+
documents.append({
|
| 299 |
+
"document_id": f"{repo}_readme",
|
| 300 |
+
"document_type": "readme",
|
| 301 |
+
"source_repo": repo,
|
| 302 |
+
"title": "Project README",
|
| 303 |
+
"body": content,
|
| 304 |
+
"author": "System",
|
| 305 |
+
"number": 0,
|
| 306 |
+
"state": "active",
|
| 307 |
+
"priority": priority,
|
| 308 |
+
"created_at": datetime.now(timezone.utc).isoformat()
|
| 309 |
+
})
|
| 310 |
+
except Exception as e:
|
| 311 |
+
logger.error(f"Failed to fetch README for {repo}: {e}")
|
| 312 |
+
|
| 313 |
+
# Fetch CONTRIBUTING.md (high priority for contributor context)
|
| 314 |
+
if "contributing" in doc_types:
|
| 315 |
+
try:
|
| 316 |
+
content = await github_service.fetch_contributing_file(repo, github_access_token)
|
| 317 |
+
if content:
|
| 318 |
+
documents.append({
|
| 319 |
+
"document_id": f"{repo}_contributing",
|
| 320 |
+
"document_type": "contributing",
|
| 321 |
+
"source_repo": repo,
|
| 322 |
+
"title": "Contributor Guidelines",
|
| 323 |
+
"body": content,
|
| 324 |
+
"author": "System",
|
| 325 |
+
"number": 0,
|
| 326 |
+
"state": "active",
|
| 327 |
+
"priority": "high", # Always high priority
|
| 328 |
+
"created_at": datetime.now(timezone.utc).isoformat()
|
| 329 |
+
})
|
| 330 |
+
except Exception as e:
|
| 331 |
+
logger.error(f"Failed to fetch CONTRIBUTING for {repo}: {e}")
|
| 332 |
|
| 333 |
if "issue" in doc_types or "pr" in doc_types:
|
| 334 |
query = {}
|