"""Retrieve GitHub issues as evidence through the GitHub issue-search API."""

import re
from typing import Any, Dict, List, Optional

import requests

from config import settings
from schemas import RetrievedEvidence, SourceType

GITHUB_SEARCH_API = "https://api.github.com/search/issues"

# GitHub documents 100 as the maximum page size for search endpoints.
_GITHUB_MAX_PER_PAGE = 100

# Maximum number of characters of an issue body kept in a snippet.
_BODY_PREVIEW_CHARS = 500

# Compiled once at import time; both patterns are used on every call.
_TOKEN_RE = re.compile(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*")
_ERROR_NAME_RE = re.compile(r"[A-Za-z]+Error|[A-Za-z]+Exception")

# Filler words that carry no search signal and are dropped from the query.
_QUERY_STOPWORDS = frozenset(
    {"fix", "this", "code", "issue", "problem", "help", "please"}
)


def clean_text(text: Optional[str]) -> str:
    """Return *text* as a stripped string, or ``""`` for any falsy value."""
    if not text:
        return ""
    return str(text).strip()


def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase identifier-like tokens.

    A token starts with a letter or underscore and may continue with
    letters, digits or any of ``_ + # . -``.
    """
    return _TOKEN_RE.findall(clean_text(text).lower())


def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Return up to eight distinct keywords taken from an error message.

    Names ending in ``Error`` or ``Exception`` come first and keep their
    original casing; the lowercase tokens of the whole message follow.
    Duplicates are removed case-insensitively and keywords of two characters
    or fewer are discarded.
    """
    if not error_message:
        return []

    raw = clean_text(error_message)
    candidates = _ERROR_NAME_RE.findall(raw)
    candidates.extend(tokenize(raw))

    seen = set()
    keywords: List[str] = []
    for candidate in candidates:
        lowered = candidate.lower()
        if lowered in seen or len(lowered) <= 2:
            continue
        seen.add(lowered)
        keywords.append(candidate)
    return keywords[:8]


def build_github_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
) -> str:
    """Build the ``q`` string for the GitHub issue-search API.

    The query consists of, in order: the first error keyword (quoted), the
    framework, the language, and the first three message tokens that are not
    stopwords, followed by the ``is:issue`` qualifier.

    Returns:
        The query string, or ``""`` when no search term could be derived.
        The qualifier is only appended when at least one term exists, so
        callers can detect "nothing to search for" by testing for an empty
        string.
    """
    parts: List[str] = []

    error_keywords = extract_error_keywords(error_message)
    if error_keywords:
        parts.append(f'"{error_keywords[0]}"')
    if framework:
        parts.append(clean_text(framework))
    if language:
        parts.append(clean_text(language))

    # tokenize() already lowercases, so a direct membership test suffices.
    useful_tokens = [
        token for token in tokenize(message) if token not in _QUERY_STOPWORDS
    ]
    parts.extend(useful_tokens[:3])

    terms = [part for part in parts if part]
    if not terms:
        return ""

    terms.append("is:issue")
    return " ".join(terms)


def compute_github_relevance(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    comments: int,
    state: str,
) -> float:
    """Score how well an issue matches the request; higher is better.

    Scoring rules:
        * ``state == "closed"``: +1.5
        * each comment, capped at 20: +0.15
        * language found in the title: +2.0
        * framework found in the title: +3.0, and in the snippet: +1.5
          (both can apply)
        * each of the first four error keywords: +5.0 if in the title,
          otherwise +2.0 if in the snippet
        * each of the first six message tokens with four or more characters:
          +1.0 if in the title, otherwise +0.5 if in the snippet

    NOTE(review): all matching is plain substring containment, so very short
    language or framework names will match most titles.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    language_l = clean_text(language).lower() if language else ""
    framework_l = clean_text(framework).lower() if framework else ""

    relevance = 0.0

    if state == "closed":
        relevance += 1.5
    relevance += min(comments, 20) * 0.15

    if language and language_l in title_l:
        relevance += 2.0
    if framework and framework_l in title_l:
        relevance += 3.0
    if framework and framework_l in snippet_l:
        relevance += 1.5

    for keyword in extract_error_keywords(error_message)[:4]:
        needle = keyword.lower()
        if needle in title_l:
            relevance += 5.0
        elif needle in snippet_l:
            relevance += 2.0

    for token in tokenize(message)[:6]:
        if len(token) < 4:
            continue
        if token in title_l:
            relevance += 1.0
        elif token in snippet_l:
            relevance += 0.5

    return relevance


def is_github_result_relevant(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
) -> bool:
    """Return whether an issue passes the coarse relevance gate.

    An issue passes when any of the following holds:
        * the framework appears in the title or the snippet;
        * the language appears in the title;
        * one of the first three error keywords appears in the title or
          the snippet;
        * at least two of the first six message tokens with four or more
          characters appear in the title or the snippet.

    NOTE(review): matching is plain substring containment, so short names
    match very liberally.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()

    if framework:
        framework_l = clean_text(framework).lower()
        if framework_l in title_l or framework_l in snippet_l:
            return True

    if language and clean_text(language).lower() in title_l:
        return True

    for keyword in extract_error_keywords(error_message)[:3]:
        needle = keyword.lower()
        if needle in title_l or needle in snippet_l:
            return True

    matched = sum(
        1
        for token in tokenize(message)[:6]
        if len(token) >= 4 and (token in title_l or token in snippet_l)
    )
    return matched >= 2


def _build_snippet(repo_full_name: str, state: str, comments: int, body: str) -> str:
    """Join the issue metadata and body preview into one ``" | "``-separated line."""
    snippet_parts = []
    if repo_full_name:
        snippet_parts.append(f"Repo: {repo_full_name}")
    if state:
        snippet_parts.append(f"State: {state}")
    snippet_parts.append(f"Comments: {comments}")
    if body:
        snippet_parts.append(f"Body: {body}")
    return " | ".join(snippet_parts)


def search_github(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search GitHub issues and return the most relevant ones as evidence.

    Args:
        message: Free-text description of the problem.
        error_message: Optional error text used for keyword extraction.
        language: Optional programming language name.
        framework: Optional framework name.
        max_results: Maximum number of results; a falsy value falls back to
            ``settings.MAX_GITHUB_RESULTS``.

    Returns:
        Evidence items sorted by descending score and truncated to the
        result limit. An empty list is returned when no query can be built
        or when the HTTP request or JSON decoding fails; the failure is
        printed rather than raised.
    """
    query = build_github_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
    )
    if not query:
        return []

    limit = max_results or settings.MAX_GITHUB_RESULTS

    headers = {"Accept": "application/vnd.github+json"}
    if settings.GITHUB_TOKEN:
        headers["Authorization"] = f"Bearer {settings.GITHUB_TOKEN}"

    params = {
        "q": query,
        "sort": "updated",
        "order": "desc",
        # Over-fetch because many hits are filtered out below, but stay
        # within the API's page-size ceiling.
        "per_page": min(max(limit * 2, 6), _GITHUB_MAX_PER_PAGE),
    }

    try:
        response = requests.get(
            GITHUB_SEARCH_API,
            headers=headers,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best-effort lookup: network, HTTP and JSON-decoding failures
        # degrade to "no evidence" instead of propagating.
        print(f"GitHub search failed: {e}")
        return []

    # Tolerate an unexpected payload shape or an explicit null item list.
    items = (data.get("items") if isinstance(data, dict) else None) or []

    evidence_list: List[RetrievedEvidence] = []
    for item in items:
        evidence = _evidence_from_item(
            item,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
        )
        if evidence is not None:
            evidence_list.append(evidence)

    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]


def _evidence_from_item(
    item: Dict[str, Any],
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
) -> Optional[RetrievedEvidence]:
    """Convert one search hit to evidence, or ``None`` if it is filtered out."""
    title = clean_text(item.get("title"))
    if not title:
        return None

    url = clean_text(item.get("html_url"))
    state = clean_text(item.get("state"))
    # "or" guards against keys that are present but null.
    comments = int(item.get("comments") or 0)
    body = clean_text(item.get("body"))[:_BODY_PREVIEW_CHARS]
    repo_full_name = clean_text(
        (item.get("repository_url") or "").split("/repos/")[-1]
    )

    snippet = _build_snippet(repo_full_name, state, comments, body)

    if not is_github_result_relevant(
        title=title,
        snippet=snippet,
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
    ):
        return None

    relevance = compute_github_relevance(
        title=title,
        snippet=snippet,
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
        comments=comments,
        state=state,
    )
    if relevance < 2.0:
        return None

    return RetrievedEvidence(
        source_type=SourceType.GITHUB,
        title=title,
        snippet=snippet,
        url=url or None,
        score=relevance,
    )