Chat7-CodeX-Backend / github_search.py
hashan-7's picture
update code
f2d80fb verified
import requests
import re
from typing import List, Optional
from config import settings
from schemas import RetrievedEvidence, SourceType
# GitHub REST endpoint for issue search; this is the URL requested by search_github().
GITHUB_SEARCH_API = "https://api.github.com/search/issues"
def clean_text(text: Optional[str]) -> str:
    """Return *text* converted to ``str`` with surrounding whitespace removed.

    Any falsy input (``None``, ``""``, ``0`` ...) yields the empty string.
    """
    return str(text).strip() if text else ""
def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase identifier-like tokens.

    A token starts with a letter or underscore and may continue with letters,
    digits, or any of ``_ + # . -`` so that names such as ``c++``, ``c#``,
    ``node.js`` and ``re-run`` survive as single tokens.

    Trailing ``.`` and ``-`` characters are stripped from every token, so
    sentence punctuation does not stick to words (``"error."`` becomes
    ``"error"``); otherwise such tokens fail later substring comparisons.

    Args:
        text: Arbitrary text; falsy values produce an empty list.

    Returns:
        The tokens in order of appearance (duplicates are kept).
    """
    if not text:
        return []
    # No whitespace trimming is needed: the pattern never matches whitespace.
    lowered = str(text).lower()
    # The first character is always a letter/underscore, so stripping the
    # trailing punctuation can never produce an empty token.
    return [
        raw.rstrip(".-")
        for raw in re.findall(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*", lowered)
    ]
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Collect up to eight distinct keywords from an error message.

    Names ending in ``Error`` or ``Exception`` come first (original casing
    preserved), followed by the general tokens of the message. Entries are
    de-duplicated case-insensitively, keeping the first spelling seen, and
    anything of two characters or fewer is dropped.
    """
    if not error_message:
        return []

    raw = clean_text(error_message)
    named_errors = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", raw)
    candidates = named_errors + tokenize(raw)

    # dict preserves insertion order; setdefault keeps the first spelling.
    first_seen = {}
    for candidate in candidates:
        key = candidate.lower()
        if len(key) > 2:
            first_seen.setdefault(key, candidate)

    return list(first_seen.values())[:8]
def build_github_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
) -> str:
    """Build the ``q`` string for a GitHub issue search.

    The query is assembled, in order, from:

    1. the first error keyword, wrapped in double quotes;
    2. the framework name, then the language name (when given);
    3. the first three message tokens that are not filler words;
    4. the ``is:issue`` qualifier.

    Args:
        message: The user's free-text description of the problem.
        error_message: Optional raw error text to mine for a keyword.
        language: Optional programming-language name.
        framework: Optional framework name.

    Returns:
        The space-joined query, or ``""`` when no search term could be
        derived. Returning an empty string (rather than a bare ``is:issue``)
        lets callers detect "nothing to search for" with a truthiness check.
    """
    filler_words = {"fix", "this", "code", "issue", "problem", "help", "please"}
    terms: List[str] = []

    error_keywords = extract_error_keywords(error_message)
    if error_keywords:
        # Quoted so the keyword is sent as one exact term.
        terms.append(f'"{error_keywords[0]}"')

    for hint in (framework, language):
        cleaned = clean_text(hint)
        if cleaned:
            terms.append(cleaned)

    meaningful = [
        token for token in tokenize(message)
        if token.lower() not in filler_words
    ]
    terms.extend(meaningful[:3])

    if not terms:
        # Without any term the qualifier alone would match every issue.
        return ""

    terms.append("is:issue")
    return " ".join(terms)
def compute_github_relevance(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    comments: int,
    state: str,
) -> float:
    """Score how well a GitHub issue matches the user's problem.

    All text comparisons are case-insensitive substring checks. The score is
    the sum of:

    * ``+1.5`` when ``state`` is ``"closed"``;
    * ``+0.15`` per comment, counting at most 20 comments;
    * ``+2.0`` when the language appears in the title;
    * ``+3.0`` when the framework appears in the title and, independently,
      ``+1.5`` when it appears in the snippet;
    * for each of the first four error keywords: ``+5.0`` if in the title,
      otherwise ``+2.0`` if in the snippet;
    * for each of the first six message tokens of at least four characters:
      ``+1.0`` if in the title, otherwise ``+0.5`` if in the snippet.

    Args:
        title: Issue title.
        snippet: Summary text built for the issue.
        message: The user's free-text description.
        error_message: Optional raw error text.
        language: Optional programming-language name.
        framework: Optional framework name.
        comments: Number of comments on the issue.
        state: Issue state string.

    Returns:
        The accumulated relevance score.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    # Normalize once. A whitespace-only hint becomes "" and must be ignored,
    # because the empty string is a substring of every title.
    language_l = clean_text(language).lower()
    framework_l = clean_text(framework).lower()

    relevance = 0.0

    if state == "closed":
        relevance += 1.5
    relevance += min(comments, 20) * 0.15

    if language_l and language_l in title_l:
        relevance += 2.0
    if framework_l:
        if framework_l in title_l:
            relevance += 3.0
        if framework_l in snippet_l:
            relevance += 1.5

    for keyword in extract_error_keywords(error_message)[:4]:
        k = keyword.lower()
        if k in title_l:
            relevance += 5.0
        elif k in snippet_l:
            relevance += 2.0

    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l:
            relevance += 1.0
        elif t in snippet_l:
            relevance += 0.5

    return relevance
def is_github_result_relevant(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
) -> bool:
    """Decide whether a GitHub issue is worth keeping as evidence.

    All comparisons are case-insensitive substring checks. A result is
    relevant as soon as one of these holds:

    * the framework appears in the title or the snippet;
    * the language appears in the title;
    * any of the first three error keywords appears in the title or snippet;
    * at least two of the first six message tokens (of four or more
      characters) appear in the title or snippet.

    Args:
        title: Issue title.
        snippet: Summary text built for the issue.
        message: The user's free-text description.
        error_message: Optional raw error text.
        language: Optional programming-language name.
        framework: Optional framework name.

    Returns:
        ``True`` when the issue passes any of the checks above.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    # A whitespace-only hint normalizes to "" and must not count as a match,
    # since the empty string is contained in every title and snippet.
    language_l = clean_text(language).lower()
    framework_l = clean_text(framework).lower()

    if framework_l and (framework_l in title_l or framework_l in snippet_l):
        return True
    if language_l and language_l in title_l:
        return True

    for keyword in extract_error_keywords(error_message)[:3]:
        k = keyword.lower()
        if k in title_l or k in snippet_l:
            return True

    matched = 0
    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) >= 4 and (t in title_l or t in snippet_l):
            matched += 1
    return matched >= 2
def search_github(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search GitHub issues for the described problem and return ranked evidence.

    The query is built with ``build_github_query``. Each returned issue is
    filtered with ``is_github_result_relevant``, scored with
    ``compute_github_relevance`` (scores below 2.0 are dropped), and the
    survivors are sorted by score, highest first.

    The search is best-effort: any failure while requesting or decoding the
    response is printed and results in an empty list instead of an exception.

    Args:
        message: The user's free-text description of the problem.
        error_message: Optional raw error text.
        language: Optional programming-language name.
        framework: Optional framework name.
        max_results: Maximum number of results to return; a falsy value
            (``None`` or ``0``) falls back to ``settings.MAX_GITHUB_RESULTS``.

    Returns:
        At most ``max_results`` ``RetrievedEvidence`` items with
        ``source_type=SourceType.GITHUB``.
    """
    query = build_github_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
    )
    if not query:
        return []

    limit = max_results or settings.MAX_GITHUB_RESULTS

    headers = {
        "Accept": "application/vnd.github+json",
    }
    if settings.GITHUB_TOKEN:
        headers["Authorization"] = f"Bearer {settings.GITHUB_TOKEN}"

    params = {
        "q": query,
        "sort": "updated",
        "order": "desc",
        # Over-fetch (twice the limit, at least 6) because local filtering
        # discards results; GitHub documents 100 as the per_page maximum.
        "per_page": min(max(limit * 2, 6), 100),
    }

    try:
        response = requests.get(
            GITHUB_SEARCH_API,
            headers=headers,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        # Deliberately broad: a failed lookup must never break the caller.
        print(f"GitHub search failed: {e}")
        return []

    # Tolerate a non-object payload or an explicit null for "items".
    items = data.get("items") if isinstance(data, dict) else None

    evidence_list: List[RetrievedEvidence] = []
    for item in items or []:
        if not isinstance(item, dict):
            continue

        title = clean_text(item.get("title"))
        if not title:
            continue

        url = clean_text(item.get("html_url"))
        state = clean_text(item.get("state"))
        # `or` guards against keys that are present but null in the JSON.
        comments = int(item.get("comments") or 0)
        body = clean_text(item.get("body"))[:500]
        # Keep whatever follows "/repos/" in the repository URL.
        repo_full_name = clean_text(
            (item.get("repository_url") or "").split("/repos/")[-1]
        )

        snippet_parts = []
        if repo_full_name:
            snippet_parts.append(f"Repo: {repo_full_name}")
        if state:
            snippet_parts.append(f"State: {state}")
        snippet_parts.append(f"Comments: {comments}")
        if body:
            snippet_parts.append(f"Body: {body}")
        snippet = " | ".join(snippet_parts)

        if not is_github_result_relevant(
            title=title,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
        ):
            continue

        relevance = compute_github_relevance(
            title=title,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
            comments=comments,
            state=state,
        )
        if relevance < 2.0:
            continue

        evidence_list.append(
            RetrievedEvidence(
                source_type=SourceType.GITHUB,
                title=title,
                snippet=snippet,
                url=url or None,
                score=relevance,
            )
        )

    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]