| import requests |
| import re |
| from typing import List, Optional |
|
|
| from config import settings |
| from schemas import RetrievedEvidence, SourceType |
|
|
|
|
# Endpoint that search_github() sends its GET request to.
GITHUB_SEARCH_API: str = "https://api.github.com/search/issues"
|
|
|
|
def clean_text(text: Optional[str]) -> str:
    """Return *text* as a whitespace-trimmed string.

    Any falsy input (``None``, ``""``, ``0``, ...) yields the empty string;
    everything else is converted with ``str()`` and stripped on both ends.
    """
    return str(text).strip() if text else ""
|
|
|
|
# A token starts with a letter or underscore and may continue with letters,
# digits and the symbols found in technology names (c++, c#, node.js, scikit-learn).
_TOKEN_PATTERN = re.compile(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*")


def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase, identifier-like tokens.

    Falsy input produces an empty list. Tokens keep interior ``.``/``-`` and
    any ``+``/``#`` so names such as ``node.js``, ``c++`` and ``c#`` survive,
    but trailing ``.`` and ``-`` are removed so sentence punctuation
    ("build fails.") does not become part of the token.

    Returns:
        The tokens in order of appearance (duplicates are kept).
    """
    # Whitespace can never be part of a match, so trimming the input first
    # would not change the result; only lower-casing is needed here.
    lowered = str(text).lower() if text else ""
    # A token always starts with a letter/underscore, so rstrip() can never
    # reduce it to an empty string.
    return [match.rstrip(".-") for match in _TOKEN_PATTERN.findall(lowered)]
|
|
|
|
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Pick up to eight distinct keywords out of an error message.

    Names that look like exception classes (``...Error`` / ``...Exception``)
    come first in their original casing, followed by the lowercase tokens
    produced by ``tokenize``. Candidates are de-duplicated case-insensitively,
    keeping the first spelling seen, and anything of two characters or fewer
    is dropped.

    Returns:
        At most eight keywords; an empty list when *error_message* is falsy.
    """
    if not error_message:
        return []

    text = clean_text(error_message)
    candidates = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", text) + tokenize(text)

    # Dicts preserve insertion order, and setdefault keeps the first spelling
    # registered for each lowercase key.
    first_seen = {}
    for candidate in candidates:
        key = candidate.lower()
        if len(key) > 2:
            first_seen.setdefault(key, candidate)

    return list(first_seen.values())[:8]
|
|
|
|
# Filler words from the user's message that carry no search value.
_QUERY_STOPWORDS = frozenset(
    {"fix", "this", "code", "issue", "problem", "help", "please"}
)


def build_github_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
) -> str:
    """Assemble the ``q`` string for a GitHub issue search.

    The query is built, in order, from:
      1. the first error keyword, wrapped in double quotes,
      2. the framework name,
      3. the language name,
      4. up to three message tokens that are not filler words,
    followed by the ``is:issue`` qualifier.

    Returns:
        The space-joined query, or ``""`` when none of the inputs produced a
        search term. Returning an empty string (rather than a bare
        ``is:issue``) lets callers skip the request instead of searching
        every issue on GitHub.
    """
    terms = []

    error_keywords = extract_error_keywords(error_message)
    if error_keywords:
        terms.append(f'"{error_keywords[0]}"')

    if framework:
        terms.append(clean_text(framework))

    if language:
        terms.append(clean_text(language))

    useful_tokens = [
        token for token in tokenize(message)
        if token.lower() not in _QUERY_STOPWORDS
    ]
    terms.extend(useful_tokens[:3])

    # Whitespace-only framework/language values clean down to "".
    terms = [term for term in terms if term]
    if not terms:
        return ""

    terms.append("is:issue")
    return " ".join(terms)
|
|
|
|
def compute_github_relevance(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    comments: int,
    state: str,
) -> float:
    """Score how well a GitHub result matches the user's request.

    All matching is case-insensitive substring matching against the title and
    the snippet. Contributions to the score:
      * +1.5 when ``state`` is ``"closed"``;
      * +0.15 per comment, counting at most 20 comments;
      * +2.0 when the language appears in the title;
      * +3.0 / +1.5 when the framework appears in the title / snippet
        (both can apply);
      * for each of the first four error keywords: +5.0 if in the title,
        otherwise +2.0 if in the snippet;
      * for each of the first six message tokens with at least four
        characters: +1.0 if in the title, otherwise +0.5 if in the snippet.

    Returns:
        The accumulated score as a float.
    """
    title_text = clean_text(title).lower()
    snippet_text = clean_text(snippet).lower()

    def weight(term: str, title_points: float, snippet_points: float) -> float:
        # A title hit takes precedence; the snippet is only a fallback.
        if term in title_text:
            return title_points
        if term in snippet_text:
            return snippet_points
        return 0.0

    score = 1.5 if state == "closed" else 0.0
    score += min(comments, 20) * 0.15

    if language:
        if clean_text(language).lower() in title_text:
            score += 2.0

    if framework:
        framework_text = clean_text(framework).lower()
        if framework_text in title_text:
            score += 3.0
        if framework_text in snippet_text:
            score += 1.5

    for keyword in extract_error_keywords(error_message)[:4]:
        score += weight(keyword.lower(), 5.0, 2.0)

    for token in tokenize(message)[:6]:
        word = token.lower()
        if len(word) >= 4:
            score += weight(word, 1.0, 0.5)

    return score
|
|
|
|
def is_github_result_relevant(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
) -> bool:
    """Decide whether a GitHub result is worth keeping.

    A result passes as soon as one of these holds (case-insensitive substring
    checks):
      * the framework appears in the title or the snippet;
      * the language appears in the title;
      * any of the first three error keywords appears in the title or snippet;
      * at least two of the first six message tokens (only those with four or
        more characters) appear in the title or snippet.
    """
    title_text = clean_text(title).lower()
    snippet_text = clean_text(snippet).lower()

    def mentioned(term: str) -> bool:
        return term in title_text or term in snippet_text

    if framework and mentioned(clean_text(framework).lower()):
        return True

    # The language is only checked against the title, not the snippet.
    if language and clean_text(language).lower() in title_text:
        return True

    leading_keywords = extract_error_keywords(error_message)[:3]
    if any(mentioned(keyword.lower()) for keyword in leading_keywords):
        return True

    words = (token.lower() for token in tokenize(message)[:6])
    hits = sum(1 for word in words if len(word) >= 4 and mentioned(word))
    return hits >= 2
|
|
|
|
def search_github(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search GitHub issues and return the most relevant ones as evidence.

    The query is produced by ``build_github_query``. Each returned item is
    filtered with ``is_github_result_relevant``, scored with
    ``compute_github_relevance`` (items scoring below 2.0 are dropped), and
    the survivors are returned sorted by descending score.

    Args:
        message: The user's free-text request.
        error_message: Optional error text to extract keywords from.
        language: Optional programming language name.
        framework: Optional framework name.
        max_results: Maximum number of results; a falsy value falls back to
            ``settings.MAX_GITHUB_RESULTS``.

    Returns:
        Up to ``max_results`` evidence entries. An empty list is returned when
        no query could be built, when the request fails, or when the response
        does not have the expected shape; this function never raises for
        network or parsing problems.
    """
    query = build_github_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
    )
    if not query:
        return []

    limit = max_results or settings.MAX_GITHUB_RESULTS

    headers = {
        "Accept": "application/vnd.github+json",
    }
    if settings.GITHUB_TOKEN:
        headers["Authorization"] = f"Bearer {settings.GITHUB_TOKEN}"

    params = {
        "q": query,
        "sort": "updated",
        "order": "desc",
        # Over-fetch so that local filtering still leaves enough results,
        # but stay within GitHub's documented per_page maximum of 100.
        "per_page": min(max(limit * 2, 6), 100),
    }

    try:
        response = requests.get(
            GITHUB_SEARCH_API,
            headers=headers,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        # Deliberately broad: retrieval is best-effort and must not take the
        # caller down when GitHub is unreachable or answers with garbage.
        print(f"GitHub search failed: {e}")
        return []

    # Guard against payloads that are not an object or carry "items": null.
    items = data.get("items") if isinstance(data, dict) else None
    if not isinstance(items, list):
        return []

    evidence_list: List[RetrievedEvidence] = []

    for item in items:
        if not isinstance(item, dict):
            continue

        title = clean_text(item.get("title"))
        if not title:
            continue

        url = clean_text(item.get("html_url"))
        state = clean_text(item.get("state"))
        body = clean_text(item.get("body"))[:500]

        # "comments" may be missing, null or non-numeric; treat all as zero.
        try:
            comments = int(item.get("comments") or 0)
        except (TypeError, ValueError):
            comments = 0

        # Keep whatever follows "/repos/" in repository_url; a null value
        # becomes an empty name.
        repository_url = str(item.get("repository_url") or "")
        repo_full_name = clean_text(repository_url.split("/repos/")[-1])

        snippet_parts = []
        if repo_full_name:
            snippet_parts.append(f"Repo: {repo_full_name}")
        if state:
            snippet_parts.append(f"State: {state}")
        snippet_parts.append(f"Comments: {comments}")
        if body:
            snippet_parts.append(f"Body: {body}")
        snippet = " | ".join(snippet_parts)

        if not is_github_result_relevant(
            title=title,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
        ):
            continue

        relevance = compute_github_relevance(
            title=title,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
            comments=comments,
            state=state,
        )
        if relevance < 2.0:
            continue

        evidence_list.append(
            RetrievedEvidence(
                source_type=SourceType.GITHUB,
                title=title,
                snippet=snippet,
                url=url or None,
                score=relevance,
            )
        )

    # Entries without a score sort after every scored entry.
    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]