| import html |
| import re |
| import requests |
| from typing import List, Optional |
|
|
| from config import settings |
| from schemas import RetrievedEvidence, SourceType |
|
|
|
|
# Value sent as the ``site`` request parameter by search_stackoverflow.
STACK_SITE = "stackoverflow"

# Endpoint that search_stackoverflow issues its GET request against.
STACK_API_BASE = "https://api.stackexchange.com/2.3/search/advanced"
|
|
|
|
def clean_text(text: Optional[str]) -> str:
    """Return *text* as plain, whitespace-normalised text.

    Markup tags are removed, HTML entities are decoded, runs of whitespace
    are collapsed to a single space and the result is stripped.

    Args:
        text: Raw text, possibly containing HTML tags and entities. Any
            falsy value (``None``, ``""``) yields an empty string; other
            non-string values are converted with ``str()``.

    Returns:
        The cleaned text, or ``""`` when there is nothing to clean.
    """
    if not text:
        return ""
    # Strip tags *before* decoding entities. Decoding first would turn
    # escaped content such as "List&lt;String&gt;" into "<String>", which
    # the tag pattern would then delete as if it were real markup.
    without_tags = re.sub(r"<[^>]+>", " ", str(text))
    decoded = html.unescape(without_tags)
    return re.sub(r"\s+", " ", decoded).strip()
|
|
|
|
def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase identifier-like tokens.

    The text is cleaned with ``clean_text`` and lowercased first. A token
    starts with a letter or underscore and may continue with letters,
    digits and ``_ + # . -`` so that names such as ``c++``, ``c#`` and
    ``node.js`` stay in one piece.

    Args:
        text: Text to tokenize; ``None`` or empty input yields ``[]``.

    Returns:
        Tokens in order of appearance (duplicates are kept).
    """
    cleaned = clean_text(text).lower()
    raw_tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*", cleaned)
    # "." and "-" are allowed inside a token, but at the end they are
    # sentence punctuation ("help.", "fails -"). Left in place they defeat
    # stop-word filtering and substring matching done by callers. Every
    # token starts with a letter or underscore, so stripping never empties it.
    return [token.rstrip(".-") for token in raw_tokens]
|
|
|
|
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Pick up to eight search keywords out of an error message.

    Names ending in ``Error`` or ``Exception`` come first (original
    casing), followed by the message's tokens as produced by ``tokenize``.
    Entries are de-duplicated case-insensitively, keeping the first
    spelling seen, and entries of two characters or fewer are dropped.

    Args:
        error_message: Raw error text; falsy input yields ``[]``.

    Returns:
        At most eight keywords, most specific first.
    """
    if not error_message:
        return []

    normalized = clean_text(error_message)
    exception_names = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", normalized)
    candidates = [*exception_names, *tokenize(normalized)]

    # Dicts preserve insertion order, so this keeps the first spelling of
    # each keyword in the order the candidates were discovered.
    first_seen = {}
    for candidate in candidates:
        key = candidate.lower()
        if len(key) <= 2 or key in first_seen:
            continue
        first_seen[key] = candidate

    return list(first_seen.values())[:8]
|
|
|
|
def extract_code_keywords(code: Optional[str]) -> List[str]:
    """Collect up to five identifiers that characterise a code snippet.

    Names are gathered pattern by pattern: first every ``def`` name, then
    every ``class`` name, then names following ``import``, then the module
    in ``from <module> import``. Duplicates are removed
    case-insensitively, keeping the first spelling seen.

    Args:
        code: Source code to scan; falsy input yields ``[]``.

    Returns:
        At most five identifiers in discovery order.
    """
    if not code:
        return []

    name_patterns = (
        r"\bdef\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bclass\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bimport\s+([A-Za-z_][A-Za-z0-9_.]*)",
        r"\bfrom\s+([A-Za-z_][A-Za-z0-9_.]*)\s+import\b",
    )
    # Each pattern has exactly one group, so findall yields plain strings.
    found = [
        name
        for name_pattern in name_patterns
        for name in re.findall(name_pattern, code)
    ]

    unique = {}
    for name in found:
        # setdefault keeps the first spelling for a given lowercase key.
        unique.setdefault(name.lower(), name)

    return list(unique.values())[:5]
|
|
|
|
def build_stack_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
) -> str:
    """Assemble a free-text search query from the user's request.

    The query is built, in order, from: the first error keyword (wrapped
    in double quotes), the framework, the language, up to two code
    keywords, and up to three message tokens that are not in a small
    stop-word list.

    Args:
        message: The user's free-text message.
        error_message: Optional error output accompanying the message.
        language: Optional programming language hint.
        framework: Optional framework hint.
        code: Optional code snippet.

    Returns:
        The space-joined query, or ``""`` when no term could be derived.
    """
    # Filler words and language names that should not be taken from the
    # message as search terms.
    ignored_words = {
        "fix", "this", "code", "issue", "problem", "help", "please",
        "python", "javascript", "java", "flutter", "react",
    }
    query_terms: List[str] = []

    error_terms = extract_error_keywords(error_message)
    if error_terms:
        query_terms.append(f'"{error_terms[0]}"')

    # Framework is added ahead of language.
    for hint in (framework, language):
        if hint:
            query_terms.append(clean_text(hint))

    query_terms += extract_code_keywords(code)[:2]

    meaningful_words = [
        word for word in tokenize(message) if word.lower() not in ignored_words
    ]
    query_terms += meaningful_words[:3]

    # filter(None, ...) drops hints that cleaned down to an empty string.
    return " ".join(filter(None, query_terms)).strip()
|
|
|
|
def _mentions_term(text: str, term: str) -> bool:
    """Return True when *term* occurs in *text* as a whole term.

    A term that starts/ends with a word character must not be glued to
    another word character on that side, and must not be followed by
    ``+`` or ``#``. This stops "java" from matching "javascript" and "c"
    from matching "c++", "c#" or any word containing the letter. An empty
    term never matches.
    """
    if not term:
        return False
    left_guard = r"(?<!\w)" if re.match(r"\w", term) else ""
    right_guard = r"(?![\w+#])" if re.search(r"\w\Z", term) else ""
    return re.search(left_guard + re.escape(term) + right_guard, text) is not None


def compute_stack_relevance(
    title: str,
    tags: List[str],
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    score: int,
    is_answered: bool,
) -> float:
    """Score how relevant one search result is to the user's request.

    Contributions:
      * +2.0 when the question is answered;
      * ``min(score, 10) * 0.4`` for the vote score (``None`` counts as 0);
      * for each of language and framework: +3.0 when mentioned in the
        title as a whole term, +4.0 when it equals one of the tags;
      * for each of the first four error keywords: +6.0 when found in the
        title, otherwise +3.0 when found in the snippet;
      * for each of the first six message tokens of at least four
        characters: +1.5 when found in the title, otherwise +0.75 when
        found in the snippet.

    Args:
        title: Result title.
        tags: Result tags.
        snippet: Summary text built for the result.
        message: The user's free-text message.
        error_message: Optional error output from the user.
        language: Optional programming language hint.
        framework: Optional framework hint.
        score: Vote score of the result.
        is_answered: Whether the result is marked as answered.

    Returns:
        The accumulated relevance score.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    tags_l = [clean_text(tag).lower() for tag in (tags or [])]
    base = float(score if score is not None else 0)

    relevance = 0.0

    if is_answered:
        relevance += 2.0

    relevance += min(base, 10.0) * 0.4

    for hint in (language, framework):
        hint_l = clean_text(hint).lower() if hint else ""
        # A hint that cleans down to "" (e.g. whitespace only) would be a
        # substring of every title; skip it instead of awarding points.
        if not hint_l:
            continue
        # Whole-term match: a plain substring test lets "java" score on
        # "javascript" titles and "c" score on nearly everything.
        if _mentions_term(title_l, hint_l):
            relevance += 3.0
        if hint_l in tags_l:
            relevance += 4.0

    for keyword in extract_error_keywords(error_message)[:4]:
        k = keyword.lower()
        if k in title_l:
            relevance += 6.0
        elif k in snippet_l:
            relevance += 3.0

    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l:
            relevance += 1.5
        elif t in snippet_l:
            relevance += 0.75

    return relevance
|
|
|
|
# Largest ``pagesize`` the Stack Exchange API accepts; larger values are
# rejected with an error response.
_STACK_MAX_PAGESIZE = 100


def search_stackoverflow(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search Stack Overflow and return the most relevant questions.

    A query is built with ``build_stack_query`` and sent to
    ``STACK_API_BASE``. Each returned item is scored with
    ``compute_stack_relevance``; items scoring below 2.0 or lacking a
    title are discarded, and the rest are returned best-first.

    Args:
        message: The user's free-text message.
        error_message: Optional error output from the user.
        language: Optional programming language hint.
        framework: Optional framework hint.
        code: Optional code snippet.
        max_results: Maximum number of results to return; a falsy value
            falls back to ``settings.MAX_STACK_RESULTS``.

    Returns:
        Up to ``max_results`` evidence entries sorted by descending
        relevance. An empty list is returned when no query could be built
        or when the request fails (the failure is printed, not raised).
    """
    query = build_stack_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
        code=code,
    )
    if not query:
        return []

    limit = max_results or settings.MAX_STACK_RESULTS

    # Request twice the limit (at least 6), presumably so that relevance
    # filtering still leaves enough results. Clamp to the API maximum so a
    # large limit does not turn every search into a failed request.
    page_size = min(max(limit * 2, 6), _STACK_MAX_PAGESIZE)

    params = {
        "order": "desc",
        "sort": "relevance",
        "q": query,
        "site": STACK_SITE,
        "pagesize": page_size,
        "filter": "default",
    }
    if settings.STACKOVERFLOW_KEY:
        params["key"] = settings.STACKOVERFLOW_KEY

    # Best-effort lookup: any failure degrades to "no evidence".
    try:
        response = requests.get(
            STACK_API_BASE,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Stack Overflow search failed: {e}")
        return []

    # Tolerate an unexpected payload shape (non-object JSON, null "items")
    # instead of raising outside the try block above.
    items = data.get("items") if isinstance(data, dict) else None

    evidence_list: List[RetrievedEvidence] = []
    for item in items or []:
        if not isinstance(item, dict):
            continue

        title = clean_text(item.get("title"))
        if not title:
            continue

        link = clean_text(item.get("link"))
        score = item.get("score", 0)
        tags = item.get("tags", []) or []
        is_answered = item.get("is_answered", False)

        snippet_parts = []
        if tags:
            snippet_parts.append(f"Tags: {', '.join(tags)}")
        snippet_parts.append(f"Answered: {'yes' if is_answered else 'no'}")
        snippet_parts.append(f"Score: {score}")
        snippet = " | ".join(snippet_parts)

        relevance = compute_stack_relevance(
            title=title,
            tags=tags,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
            score=score,
            is_answered=is_answered,
        )
        if relevance < 2.0:
            continue

        evidence_list.append(
            RetrievedEvidence(
                source_type=SourceType.STACKOVERFLOW,
                title=title,
                snippet=snippet,
                url=link or None,
                score=relevance,
            )
        )

    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]