"""Stack Overflow retrieval helpers.

Builds a search query from a user's message, error text and code, calls the
Stack Exchange ``search/advanced`` endpoint, and converts the returned
questions into scored ``RetrievedEvidence`` records.
"""

import html
import re
from typing import Iterable, List, Optional

import requests

from config import settings
from schemas import RetrievedEvidence, SourceType

STACK_API_BASE = "https://api.stackexchange.com/2.3/search/advanced"
STACK_SITE = "stackoverflow"

# The Stack Exchange API rejects requests whose ``pagesize`` exceeds 100.
_STACK_MAX_PAGESIZE = 100

# Message tokens that are too generic to help narrow a search query.
_GENERIC_MESSAGE_TOKENS = frozenset(
    {
        "fix",
        "this",
        "code",
        "issue",
        "problem",
        "help",
        "please",
        "python",
        "javascript",
        "java",
        "flutter",
        "react",
    }
)


def clean_text(text: Optional[str]) -> str:
    """Return *text* with HTML entities decoded, tags removed and whitespace collapsed.

    Falsy input (``None`` or ``""``) yields an empty string.
    """
    if not text:
        return ""

    text = html.unescape(str(text))
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase identifier-like tokens.

    A token starts with a letter or underscore and may contain digits and the
    characters ``_ + # . -`` (so ``c++``, ``c#`` and ``node.js`` survive).
    """
    cleaned = clean_text(text).lower()
    raw_tokens = re.findall(r"[a-zA-Z_][a-zA-Z0-9_+#.-]*", cleaned)
    # "." and "-" are allowed inside tokens, which means sentence punctuation
    # ("... is not a function.") would otherwise stay glued to the last word
    # and defeat later substring matching and de-duplication.
    return [token.rstrip(".-") for token in raw_tokens]


def _dedupe_case_insensitive(items: Iterable[str], min_length: int = 0) -> List[str]:
    """Return *items* in order, dropping case-insensitive repeats and short entries."""
    seen = set()
    unique: List[str] = []
    for item in items:
        lowered = item.lower()
        if lowered in seen or len(lowered) < min_length:
            continue
        seen.add(lowered)
        unique.append(item)
    return unique


def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Return up to eight distinct keywords taken from an error message.

    Names ending in ``Error``/``Exception`` come first (original casing kept),
    followed by the message's lowercase tokens. Keywords shorter than three
    characters are discarded.
    """
    if not error_message:
        return []

    cleaned = clean_text(error_message)
    keywords: List[str] = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", cleaned)
    keywords.extend(tokenize(cleaned))

    return _dedupe_case_insensitive(keywords, min_length=3)[:8]


def extract_code_keywords(code: Optional[str]) -> List[str]:
    """Return up to five distinct names defined or imported by *code*.

    Looks for ``def``, ``class``, ``import`` and ``from ... import`` statements.
    """
    if not code:
        return []

    patterns = [
        r"\bdef\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bclass\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bimport\s+([A-Za-z_][A-Za-z0-9_.]*)",
        r"\bfrom\s+([A-Za-z_][A-Za-z0-9_.]*)\s+import\b",
    ]

    interesting: List[str] = []
    for pattern in patterns:
        interesting.extend(re.findall(pattern, code))

    return _dedupe_case_insensitive(interesting)[:5]


def build_stack_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
) -> str:
    """Assemble the free-text search query sent to Stack Overflow.

    The query is, in order: the first error keyword (quoted), the framework,
    the language, up to two code keywords, and up to three message tokens that
    are not generic filler words. Returns ``""`` when nothing usable is found.
    """
    parts: List[str] = []

    error_keywords = extract_error_keywords(error_message)
    if error_keywords:
        parts.append(f'"{error_keywords[0]}"')

    if framework:
        parts.append(clean_text(framework))

    if language:
        parts.append(clean_text(language))

    parts.extend(extract_code_keywords(code)[:2])

    important_message_tokens = [
        token
        for token in tokenize(message)
        if token.lower() not in _GENERIC_MESSAGE_TOKENS
    ]
    parts.extend(important_message_tokens[:3])

    query = " ".join(part for part in parts if part)
    return query.strip()


def compute_stack_relevance(
    title: str,
    tags: List[str],
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    score: int,
    is_answered: bool,
) -> float:
    """Score how well a Stack Overflow question matches the user's request.

    Contributions:
      * +2.0 if the question is answered;
      * ``min(score, 10) * 0.4`` for the question's vote score;
      * +3.0 / +4.0 when the language or framework appears in the title / tags;
      * +6.0 (title) or +3.0 (snippet) for each of the first four error keywords;
      * +1.5 (title) or +0.75 (snippet) for each of the first six message
        tokens that are at least four characters long.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    tags_l = {clean_text(tag).lower() for tag in (tags or [])}

    relevance = 0.0

    if is_answered:
        relevance += 2.0

    base = float(score if score is not None else 0)
    relevance += min(base, 10.0) * 0.4

    for term in (language, framework):
        term_l = clean_text(term).lower()
        # A blank term would match every title ("" is a substring of any
        # string), so only score terms that still have content once cleaned.
        if not term_l:
            continue
        if term_l in title_l:
            relevance += 3.0
        if term_l in tags_l:
            relevance += 4.0

    for keyword in extract_error_keywords(error_message)[:4]:
        k = keyword.lower()
        if k in title_l:
            relevance += 6.0
        elif k in snippet_l:
            relevance += 3.0

    for token in tokenize(message)[:6]:
        t = token.lower()
        if len(t) < 4:
            continue
        if t in title_l:
            relevance += 1.5
        elif t in snippet_l:
            relevance += 0.75

    return relevance


def search_stackoverflow(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search Stack Overflow and return the most relevant questions as evidence.

    Args:
        message: The user's free-text request.
        error_message: Optional error text; its first keyword anchors the query.
        language: Optional programming language name.
        framework: Optional framework name.
        code: Optional source code, mined for definition and import names.
        max_results: Maximum number of results; falls back to
            ``settings.MAX_STACK_RESULTS`` when falsy.

    Returns:
        Evidence records sorted by descending relevance. The list is empty when
        no query can be built, the request fails, or nothing scores at least 2.0.
    """
    query = build_stack_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
        code=code,
    )

    if not query:
        return []

    limit = max_results or settings.MAX_STACK_RESULTS

    params = {
        "order": "desc",
        "sort": "relevance",
        "q": query,
        "site": STACK_SITE,
        # Over-fetch so filtering by relevance still leaves enough results,
        # but never exceed the API's page-size ceiling.
        "pagesize": min(max(limit * 2, 6), _STACK_MAX_PAGESIZE),
        "filter": "default",
    }

    if settings.STACKOVERFLOW_KEY:
        params["key"] = settings.STACKOVERFLOW_KEY

    # Search is best-effort: any failure is reported and yields no evidence.
    try:
        response = requests.get(
            STACK_API_BASE,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Stack Overflow search failed: {e}")
        return []

    # Tolerate a payload that is not a JSON object or carries ``"items": null``.
    items = (data.get("items") or []) if isinstance(data, dict) else []

    evidence_list: List[RetrievedEvidence] = []

    for item in items:
        if not isinstance(item, dict):
            continue

        title = clean_text(item.get("title"))
        if not title:
            continue

        link = clean_text(item.get("link"))
        score = item.get("score") or 0  # a null score is treated as 0
        tags = item.get("tags", []) or []
        is_answered = item.get("is_answered", False)

        snippet_parts = []
        if tags:
            snippet_parts.append(f"Tags: {', '.join(tags)}")
        snippet_parts.append(f"Answered: {'yes' if is_answered else 'no'}")
        snippet_parts.append(f"Score: {score}")
        snippet = " | ".join(snippet_parts)

        relevance = compute_stack_relevance(
            title=title,
            tags=tags,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
            score=score,
            is_answered=is_answered,
        )

        if relevance < 2.0:
            continue

        evidence_list.append(
            RetrievedEvidence(
                source_type=SourceType.STACKOVERFLOW,
                title=title,
                snippet=snippet,
                url=link or None,
                score=relevance,
            )
        )

    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]