hashan-7 committed on
Commit
73a75a4
·
verified ·
1 Parent(s): f2d80fb

update code

Browse files
Files changed (1) hide show
  1. stack_search.py +156 -15
stack_search.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import requests
2
  from typing import List, Optional
3
 
@@ -12,7 +14,64 @@ STACK_SITE = "stackoverflow"
12
  def clean_text(text: Optional[str]) -> str:
13
  if not text:
14
  return ""
15
- return str(text).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
 
18
  def build_stack_query(
@@ -20,8 +79,13 @@ def build_stack_query(
20
  error_message: Optional[str] = None,
21
  language: Optional[str] = None,
22
  framework: Optional[str] = None,
 
23
  ) -> str:
24
- parts = []
 
 
 
 
25
 
26
  if framework:
27
  parts.append(clean_text(framework))
@@ -29,21 +93,83 @@ def build_stack_query(
29
  if language:
30
  parts.append(clean_text(language))
31
 
32
- if error_message:
33
- parts.append(clean_text(error_message))
34
 
35
- if message:
36
- parts.append(clean_text(message))
 
 
 
 
 
 
 
37
 
38
  query = " ".join(part for part in parts if part)
39
  return query.strip()
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def search_stackoverflow(
43
  message: str,
44
  error_message: Optional[str] = None,
45
  language: Optional[str] = None,
46
  framework: Optional[str] = None,
 
47
  max_results: Optional[int] = None,
48
  ) -> List[RetrievedEvidence]:
49
  query = build_stack_query(
@@ -51,6 +177,7 @@ def search_stackoverflow(
51
  error_message=error_message,
52
  language=language,
53
  framework=framework,
 
54
  )
55
 
56
  if not query:
@@ -61,7 +188,7 @@ def search_stackoverflow(
61
  "sort": "relevance",
62
  "q": query,
63
  "site": STACK_SITE,
64
- "pagesize": max_results or settings.MAX_STACK_RESULTS,
65
  "filter": "default",
66
  }
67
 
@@ -87,20 +214,33 @@ def search_stackoverflow(
87
  title = clean_text(item.get("title"))
88
  link = clean_text(item.get("link"))
89
  score = item.get("score", 0)
90
-
91
- tags = item.get("tags", [])
92
- tag_text = ", ".join(tags) if tags else ""
93
  is_answered = item.get("is_answered", False)
94
 
 
 
 
95
  snippet_parts = []
96
- if tag_text:
97
- snippet_parts.append(f"Tags: {tag_text}")
98
  snippet_parts.append(f"Answered: {'yes' if is_answered else 'no'}")
99
  snippet_parts.append(f"Score: {score}")
100
 
101
  snippet = " | ".join(snippet_parts)
102
 
103
- if not title:
 
 
 
 
 
 
 
 
 
 
 
 
104
  continue
105
 
106
  evidence_list.append(
@@ -109,8 +249,9 @@ def search_stackoverflow(
109
  title=title,
110
  snippet=snippet,
111
  url=link or None,
112
- score=float(score) if score is not None else None,
113
  )
114
  )
115
 
116
- return evidence_list
 
 
1
+ import html
2
+ import re
3
  import requests
4
  from typing import List, Optional
5
 
 
def clean_text(text: Optional[str]) -> str:
    """Return *text* as plain, single-spaced text.

    Markup tags are replaced by a space, HTML entities are decoded, and any
    run of whitespace is collapsed to one space before trimming the ends.

    Tags are removed *before* entities are decoded. Decoding first would turn
    encoded literals such as ``vector&lt;int&gt;`` into ``vector<int>`` and
    the tag pattern would then delete ``<int>`` as if it were markup.

    Note: the function is meant for raw input. Running it a second time on
    its own output treats any decoded ``<...>`` text as a tag.

    Args:
        text: Raw text, possibly ``None`` or empty.

    Returns:
        The cleaned string; ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    # Strip real markup while encoded angle brackets are still entities.
    text = re.sub(r"<[^>]+>", " ", str(text))
    text = html.unescape(text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
21
+
22
+
def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lower-cased word-like tokens.

    The text is cleaned with ``clean_text`` and lower-cased. A token starts
    with a letter or underscore and may continue with letters, digits and
    the characters ``_ + # . -`` (so ``c++``, ``c#`` and ``node.js`` stay
    whole).

    Trailing ``.`` and ``-`` are trimmed so that sentence punctuation does
    not become part of a token: ``"error."`` yields ``"error"``, which can
    then be compared with stop words and matched inside other text.

    Args:
        text: Raw text, possibly ``None`` or empty.

    Returns:
        The tokens in order of appearance; an empty list for empty input.
    """
    cleaned = clean_text(text).lower()
    raw_tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*", cleaned)
    # Every token starts with a letter/underscore, so trimming never
    # produces an empty string.
    return [token.rstrip(".-") for token in raw_tokens]
26
+
27
+
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Extract up to eight distinct search keywords from an error message.

    Exception-style names (identifiers containing ``Error`` or
    ``Exception``) come first, in their original casing, followed by the
    tokens produced by ``tokenize``. Keywords are de-duplicated
    case-insensitively (the first spelling wins) and anything of two
    characters or fewer is dropped.

    The name pattern accepts full identifiers, including digits and
    underscores, so names such as ``Py4JJavaError`` or ``HTTP404Error`` are
    captured whole instead of being truncated or missed.

    Args:
        error_message: Raw error text, possibly ``None`` or empty.

    Returns:
        At most eight keywords; an empty list when there is no message.
    """
    if not error_message:
        return []

    cleaned = clean_text(error_message)

    # Anchor at an identifier start so the whole name is returned, not just
    # the run of letters directly in front of "Error"/"Exception".
    error_names = re.findall(
        r"\b[A-Za-z_][A-Za-z0-9_]*(?:Error|Exception)", cleaned
    )
    candidates = [*error_names, *tokenize(cleaned)]

    seen = set()
    result = []
    for item in candidates:
        lower = item.lower()
        if len(lower) > 2 and lower not in seen:
            seen.add(lower)
            result.append(item)
    return result[:8]
49
+
50
+
def extract_code_keywords(code: Optional[str]) -> List[str]:
    """Collect up to five distinct identifiers declared or imported in *code*.

    Names are gathered pattern by pattern: every ``def`` name, then every
    ``class`` name, then names following ``import``, then module names in
    ``from ... import``. Duplicates are removed case-insensitively, keeping
    the first spelling seen.

    Args:
        code: Source text to scan, possibly ``None`` or empty.

    Returns:
        At most five identifiers; an empty list when *code* is falsy.
    """
    if not code:
        return []

    name_patterns = (
        r"\bdef\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bclass\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bimport\s+([A-Za-z_][A-Za-z0-9_.]*)",
        r"\bfrom\s+([A-Za-z_][A-Za-z0-9_.]*)\s+import\b",
    )

    # Insertion-ordered dict keyed by the lower-cased name: setdefault keeps
    # the first spelling and ignores later case-insensitive repeats.
    first_spelling = {}
    for name_pattern in name_patterns:
        for found in re.findall(name_pattern, code):
            first_spelling.setdefault(found.lower(), found)

    return list(first_spelling.values())[:5]
75
 
76
 
77
  def build_stack_query(
 
79
  error_message: Optional[str] = None,
80
  language: Optional[str] = None,
81
  framework: Optional[str] = None,
82
+ code: Optional[str] = None,
83
  ) -> str:
84
+ parts: List[str] = []
85
+
86
+ error_keywords = extract_error_keywords(error_message)
87
+ if error_keywords:
88
+ parts.append(f'"{error_keywords[0]}"')
89
 
90
  if framework:
91
  parts.append(clean_text(framework))
 
93
  if language:
94
  parts.append(clean_text(language))
95
 
96
+ code_keywords = extract_code_keywords(code)
97
+ parts.extend(code_keywords[:2])
98
 
99
+ message_tokens = tokenize(message)
100
+ important_message_tokens = [
101
+ token for token in message_tokens
102
+ if token.lower() not in {
103
+ "fix", "this", "code", "issue", "problem", "help", "please",
104
+ "python", "javascript", "java", "flutter", "react"
105
+ }
106
+ ]
107
+ parts.extend(important_message_tokens[:3])
108
 
109
  query = " ".join(part for part in parts if part)
110
  return query.strip()
111
 
112
 
def _mentions_term(term: str, text: str) -> bool:
    """Return True when *term* occurs in *text* as a standalone term."""
    if not term:
        return False
    # No identifier character directly before, and no letter, underscore,
    # "+" or "#" directly after: "java" must not match "javascript" and "c"
    # must not match "c++"/"c#". A trailing digit is allowed ("python3").
    pattern = rf"(?<![A-Za-z0-9_]){re.escape(term)}(?![A-Za-z_+#])"
    return re.search(pattern, text) is not None


def compute_stack_relevance(
    title: str,
    tags: List[str],
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    score: int,
    is_answered: bool,
) -> float:
    """Score how well a search hit matches the user's request.

    The result is a sum of independent bonuses:

    * +2.0 when the question is answered;
    * ``min(score, 10) * 0.4`` for the hit's vote score (``None`` counts
      as 0; negative scores subtract);
    * for each of *language* and *framework*: +3.0 when it appears in the
      title as a standalone term and +4.0 when it equals one of the tags;
    * for each of the first four error keywords: +6.0 when found in the
      title, otherwise +3.0 when found in the snippet;
    * for each of the first six message tokens that are at least four
      characters long: +1.5 when found in the title, otherwise +0.75 when
      found in the snippet.

    All comparisons are case-insensitive. A language or framework that is
    empty after cleaning earns no bonus.

    Args:
        title: Hit title.
        tags: Tags attached to the hit.
        snippet: Summary text built for the hit.
        message: The user's free-text message.
        error_message: The user's error text, if any.
        language: Programming language named by the user, if any.
        framework: Framework named by the user, if any.
        score: Vote score of the hit.
        is_answered: Whether the hit is marked as answered.

    Returns:
        The accumulated relevance value.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    tags_l = [clean_text(tag).lower() for tag in tags or []]
    base = float(score if score is not None else 0)

    relevance = 0.0

    if is_answered:
        relevance += 2.0

    relevance += min(base, 10.0) * 0.4

    for term in (language, framework):
        term_l = clean_text(term).lower()
        if not term_l:
            # An empty string is a substring of every title; skip it so a
            # blank value cannot earn the title bonus.
            continue
        if _mentions_term(term_l, title_l):
            relevance += 3.0
        if term_l in tags_l:
            relevance += 4.0

    error_keywords = extract_error_keywords(error_message)
    for keyword in error_keywords[:4]:
        k = keyword.lower()
        if k in title_l:
            relevance += 6.0
        elif k in snippet_l:
            relevance += 3.0

    message_tokens = tokenize(message)
    for token in message_tokens[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l:
            relevance += 1.5
        elif t in snippet_l:
            relevance += 0.75

    return relevance
165
+
166
+
167
  def search_stackoverflow(
168
  message: str,
169
  error_message: Optional[str] = None,
170
  language: Optional[str] = None,
171
  framework: Optional[str] = None,
172
+ code: Optional[str] = None,
173
  max_results: Optional[int] = None,
174
  ) -> List[RetrievedEvidence]:
175
  query = build_stack_query(
 
177
  error_message=error_message,
178
  language=language,
179
  framework=framework,
180
+ code=code,
181
  )
182
 
183
  if not query:
 
188
  "sort": "relevance",
189
  "q": query,
190
  "site": STACK_SITE,
191
+ "pagesize": max((max_results or settings.MAX_STACK_RESULTS) * 2, 6),
192
  "filter": "default",
193
  }
194
 
 
214
  title = clean_text(item.get("title"))
215
  link = clean_text(item.get("link"))
216
  score = item.get("score", 0)
217
+ tags = item.get("tags", []) or []
 
 
218
  is_answered = item.get("is_answered", False)
219
 
220
+ if not title:
221
+ continue
222
+
223
  snippet_parts = []
224
+ if tags:
225
+ snippet_parts.append(f"Tags: {', '.join(tags)}")
226
  snippet_parts.append(f"Answered: {'yes' if is_answered else 'no'}")
227
  snippet_parts.append(f"Score: {score}")
228
 
229
  snippet = " | ".join(snippet_parts)
230
 
231
+ relevance = compute_stack_relevance(
232
+ title=title,
233
+ tags=tags,
234
+ snippet=snippet,
235
+ message=message,
236
+ error_message=error_message,
237
+ language=language,
238
+ framework=framework,
239
+ score=score,
240
+ is_answered=is_answered,
241
+ )
242
+
243
+ if relevance < 2.0:
244
  continue
245
 
246
  evidence_list.append(
 
249
  title=title,
250
  snippet=snippet,
251
  url=link or None,
252
+ score=relevance,
253
  )
254
  )
255
 
256
+ evidence_list.sort(key=lambda x: x.score if x.score is not None else -1, reverse=True)
257
+ return evidence_list[: (max_results or settings.MAX_STACK_RESULTS)]