hashan-7 committed on
Commit
f2d80fb
·
verified ·
1 Parent(s): 1ea7c95

update code

Browse files
Files changed (1) hide show
  1. github_search.py +156 -16
github_search.py CHANGED
@@ -1,4 +1,5 @@
1
  import requests
 
2
  from typing import List, Optional
3
 
4
  from config import settings
@@ -14,6 +15,29 @@ def clean_text(text: Optional[str]) -> str:
14
  return str(text).strip()
15
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def build_github_query(
18
  message: str,
19
  error_message: Optional[str] = None,
@@ -22,23 +46,114 @@ def build_github_query(
22
  ) -> str:
23
  parts = []
24
 
 
 
 
 
25
  if framework:
26
  parts.append(clean_text(framework))
27
 
28
  if language:
29
  parts.append(clean_text(language))
30
 
31
- if error_message:
32
- parts.append(f'"{clean_text(error_message)}"')
33
-
34
- if message:
35
- parts.append(clean_text(message))
 
 
 
36
 
37
  parts.append("is:issue")
38
 
39
  return " ".join(part for part in parts if part).strip()
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def search_github(
43
  message: str,
44
  error_message: Optional[str] = None,
@@ -65,9 +180,9 @@ def search_github(
65
 
66
  params = {
67
  "q": query,
68
- "sort": "reactions",
69
  "order": "desc",
70
- "per_page": max_results or settings.MAX_GITHUB_RESULTS,
71
  }
72
 
73
  try:
@@ -89,33 +204,58 @@ def search_github(
89
  for item in items:
90
  title = clean_text(item.get("title"))
91
  url = clean_text(item.get("html_url"))
92
- score = item.get("score")
93
  state = clean_text(item.get("state"))
 
 
94
  repo_full_name = clean_text(item.get("repository_url", "").split("/repos/")[-1])
95
 
 
 
 
96
  snippet_parts = []
97
  if repo_full_name:
98
  snippet_parts.append(f"Repo: {repo_full_name}")
99
  if state:
100
  snippet_parts.append(f"State: {state}")
101
-
102
- comments = item.get("comments")
103
- if comments is not None:
104
- snippet_parts.append(f"Comments: {comments}")
105
 
106
  snippet = " | ".join(snippet_parts)
107
 
108
- if not title:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  continue
110
 
111
  evidence_list.append(
112
  RetrievedEvidence(
113
  source_type=SourceType.GITHUB,
114
  title=title,
115
- snippet=snippet or "GitHub issue/discussion result",
116
  url=url or None,
117
- score=float(score) if score is not None else None,
118
  )
119
  )
120
 
121
- return evidence_list
 
 
1
  import requests
2
+ import re
3
  from typing import List, Optional
4
 
5
  from config import settings
 
15
  return str(text).strip()
16
 
17
 
18
def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase, word-like tokens.

    The text is normalised with ``clean_text`` and lowercased. A token starts
    with a letter or underscore and may continue with letters, digits or any
    of ``_ + # . -``, so technical names such as ``c++``, ``c#``, ``node.js``
    or ``scikit-learn`` survive as single tokens.

    Trailing ``.`` and ``-`` characters are removed from every token: they
    are sentence punctuation glued to the word ("refused.", "retry-") rather
    than part of the term, and keeping them would prevent the token from
    matching the same word elsewhere.

    Args:
        text: Free-form text; handling of ``None`` is delegated to
            ``clean_text``.

    Returns:
        The tokens in order of appearance (duplicates are kept).
    """
    cleaned = clean_text(text).lower()
    raw_tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*", cleaned)
    # Every raw token starts with a letter or underscore, so stripping the
    # trailing punctuation can never leave an empty string behind.
    return [token.rstrip(".-") for token in raw_tokens]
21
+
22
+
23
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Return up to eight distinct keywords taken from an error message.

    Names ending in ``Error`` or ``Exception`` are listed first with their
    original casing, followed by the lowercase tokens produced by
    ``tokenize``. Keywords are de-duplicated case-insensitively (the first
    occurrence wins) and anything of two characters or fewer is dropped.

    Args:
        error_message: Raw error text; a falsy value yields an empty list.

    Returns:
        At most eight keywords, in order of first appearance.
    """
    if not error_message:
        return []

    text = clean_text(error_message)
    candidates = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", text) + tokenize(text)

    unique: List[str] = []
    known = set()
    for candidate in candidates:
        folded = candidate.lower()
        # Skip very short fragments and anything already collected.
        if len(folded) <= 2 or folded in known:
            continue
        known.add(folded)
        unique.append(candidate)

    return unique[:8]
39
+
40
+
41
  def build_github_query(
42
  message: str,
43
  error_message: Optional[str] = None,
 
46
  ) -> str:
47
  parts = []
48
 
49
+ error_keywords = extract_error_keywords(error_message)
50
+ if error_keywords:
51
+ parts.append(f'"{error_keywords[0]}"')
52
+
53
  if framework:
54
  parts.append(clean_text(framework))
55
 
56
  if language:
57
  parts.append(clean_text(language))
58
 
59
+ message_tokens = tokenize(message)
60
+ filtered_message_tokens = [
61
+ token for token in message_tokens
62
+ if token.lower() not in {
63
+ "fix", "this", "code", "issue", "problem", "help", "please"
64
+ }
65
+ ]
66
+ parts.extend(filtered_message_tokens[:3])
67
 
68
  parts.append("is:issue")
69
 
70
  return " ".join(part for part in parts if part).strip()
71
 
72
 
73
def compute_github_relevance(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    comments: int,
    state: str,
) -> float:
    """Score how well a GitHub issue matches the user's problem.

    The score is the sum of these contributions:

    * ``+1.5`` when ``state`` is exactly ``"closed"``;
    * ``+0.15`` per comment, counting at most 20 comments;
    * ``+2.0`` when the language is named in the title;
    * ``+3.0`` / ``+1.5`` when the framework is named in the title / snippet
      (both can apply);
    * for each of the first four error keywords: ``+5.0`` if it occurs in the
      title, otherwise ``+2.0`` if it occurs in the snippet;
    * for each of the first six message tokens of at least four characters:
      ``+1.0`` if it occurs in the title, otherwise ``+0.5`` if it occurs in
      the snippet.

    Language and framework names must appear as standalone terms, i.e. not
    directly flanked by other letters. Plain substring matching would let
    short names such as "c", "go" or "java" score on unrelated words
    ("google", "javascript"). Digits and punctuation may follow, so
    "python3" still counts as a mention of "python".

    Args:
        title: Issue title.
        snippet: Summary text built for the issue.
        message: The user's free-form problem description.
        error_message: Optional raw error text.
        language: Optional programming language name.
        framework: Optional framework name.
        comments: Number of comments on the issue.
        state: Issue state string.

    Returns:
        The accumulated relevance score.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()

    def mentions(haystack: str, name: Optional[str]) -> bool:
        # A blank name would otherwise "match" every haystack.
        term = clean_text(name).lower() if name else ""
        if not term:
            return False
        pattern = rf"(?<![a-z]){re.escape(term)}(?![a-z])"
        return re.search(pattern, haystack) is not None

    relevance = 0.0

    if state == "closed":
        relevance += 1.5

    relevance += min(comments, 20) * 0.15

    if mentions(title_l, language):
        relevance += 2.0
    if mentions(title_l, framework):
        relevance += 3.0
    if mentions(snippet_l, framework):
        relevance += 1.5

    for keyword in extract_error_keywords(error_message)[:4]:
        k = keyword.lower()
        if k in title_l:
            relevance += 5.0
        elif k in snippet_l:
            relevance += 2.0

    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l:
            relevance += 1.0
        elif t in snippet_l:
            relevance += 0.5

    return relevance
118
+
119
+
120
def is_github_result_relevant(
    title: str,
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
) -> bool:
    """Decide whether a GitHub result is worth keeping at all.

    A result is accepted as soon as one of these holds:

    * the framework is named in the title or the snippet;
    * the language is named in the title;
    * one of the first three error keywords occurs in the title or snippet;
    * at least two of the first six message tokens (of four or more
      characters) occur in the title or snippet.

    Language and framework names must appear as standalone terms, i.e. not
    directly flanked by other letters. With plain substring matching a short
    name such as "c" or "go" is found inside almost any title, which would
    make this filter accept nearly every result. Digits and punctuation may
    follow, so "python3" still counts as a mention of "python".

    Args:
        title: Issue title.
        snippet: Summary text built for the issue.
        message: The user's free-form problem description.
        error_message: Optional raw error text.
        language: Optional programming language name.
        framework: Optional framework name.

    Returns:
        ``True`` when the result passes any of the checks above.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()

    def mentions(haystack: str, name: Optional[str]) -> bool:
        # A blank name would otherwise "match" every haystack.
        term = clean_text(name).lower() if name else ""
        if not term:
            return False
        pattern = rf"(?<![a-z]){re.escape(term)}(?![a-z])"
        return re.search(pattern, haystack) is not None

    if mentions(title_l, framework) or mentions(snippet_l, framework):
        return True

    if mentions(title_l, language):
        return True

    for keyword in extract_error_keywords(error_message)[:3]:
        k = keyword.lower()
        if k in title_l or k in snippet_l:
            return True

    matched = 0
    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l or t in snippet_l:
            matched += 1

    return matched >= 2
155
+
156
+
157
  def search_github(
158
  message: str,
159
  error_message: Optional[str] = None,
 
180
 
181
  params = {
182
  "q": query,
183
+ "sort": "updated",
184
  "order": "desc",
185
+ "per_page": max((max_results or settings.MAX_GITHUB_RESULTS) * 2, 6),
186
  }
187
 
188
  try:
 
204
  for item in items:
205
  title = clean_text(item.get("title"))
206
  url = clean_text(item.get("html_url"))
 
207
  state = clean_text(item.get("state"))
208
+ comments = int(item.get("comments", 0))
209
+ body = clean_text(item.get("body", ""))[:500]
210
  repo_full_name = clean_text(item.get("repository_url", "").split("/repos/")[-1])
211
 
212
+ if not title:
213
+ continue
214
+
215
  snippet_parts = []
216
  if repo_full_name:
217
  snippet_parts.append(f"Repo: {repo_full_name}")
218
  if state:
219
  snippet_parts.append(f"State: {state}")
220
+ snippet_parts.append(f"Comments: {comments}")
221
+ if body:
222
+ snippet_parts.append(f"Body: {body}")
 
223
 
224
  snippet = " | ".join(snippet_parts)
225
 
226
+ if not is_github_result_relevant(
227
+ title=title,
228
+ snippet=snippet,
229
+ message=message,
230
+ error_message=error_message,
231
+ language=language,
232
+ framework=framework,
233
+ ):
234
+ continue
235
+
236
+ relevance = compute_github_relevance(
237
+ title=title,
238
+ snippet=snippet,
239
+ message=message,
240
+ error_message=error_message,
241
+ language=language,
242
+ framework=framework,
243
+ comments=comments,
244
+ state=state,
245
+ )
246
+
247
+ if relevance < 2.0:
248
  continue
249
 
250
  evidence_list.append(
251
  RetrievedEvidence(
252
  source_type=SourceType.GITHUB,
253
  title=title,
254
+ snippet=snippet,
255
  url=url or None,
256
+ score=relevance,
257
  )
258
  )
259
 
260
+ evidence_list.sort(key=lambda x: x.score if x.score is not None else -1, reverse=True)
261
+ return evidence_list[: (max_results or settings.MAX_GITHUB_RESULTS)]