Chat7-CodeX-Backend / stack_search.py
hashan-7's picture
update code
73a75a4 verified
import html
import re
import requests
from typing import List, Optional
from config import settings
from schemas import RetrievedEvidence, SourceType
# Stack Exchange API (v2.3) advanced-search endpoint requested by
# search_stackoverflow().
STACK_API_BASE = "https://api.stackexchange.com/2.3/search/advanced"
# Value sent as the API's "site" query parameter.
STACK_SITE = "stackoverflow"
def clean_text(text: Optional[str]) -> str:
    """Return *text* as plain, single-spaced text.

    HTML entities are decoded first, then anything that looks like a tag
    (``<...>``) is replaced by a space, and finally every run of whitespace
    is collapsed to one space and the ends are trimmed.

    Args:
        text: Raw value to normalise. Non-string values are converted with
            ``str()``; any falsy value (``None``, ``""``, ``0``) yields ``""``.

    Returns:
        The normalised string, possibly empty.
    """
    if not text:
        return ""

    decoded = html.unescape(str(text))
    # Tags become a space (not nothing) so adjacent words do not fuse.
    without_tags = re.sub(r"<[^>]+>", " ", decoded)
    return re.sub(r"\s+", " ", without_tags).strip()
def tokenize(text: Optional[str]) -> List[str]:
    """Split *text* into lowercase, identifier-like tokens.

    The text is normalised with ``clean_text`` and lowercased. A token starts
    with a letter or underscore and may continue with letters, digits and the
    characters ``_ + # . -`` so that names such as ``c++``, ``c#``,
    ``node.js`` or ``scikit-learn`` stay in one piece.

    Trailing ``.`` and ``-`` are stripped from each token: they are sentence
    punctuation ("not found.") rather than part of the name, and leaving them
    attached makes the token fail later substring comparisons.

    Args:
        text: Text to tokenize; ``None`` or empty input yields ``[]``.

    Returns:
        Tokens in order of appearance (duplicates are kept).
    """
    cleaned = clean_text(text).lower()
    # The input is already lowercased, so only a-z needs to be matched.
    raw_tokens = re.findall(r"[a-z_][a-z0-9_+#.-]*", cleaned)
    # Every token starts with a letter/underscore, so rstrip never empties it.
    return [token.rstrip(".-") for token in raw_tokens]
def extract_error_keywords(error_message: Optional[str]) -> List[str]:
    """Pick up to eight distinct keywords from an error message.

    Names ending in ``Error`` or ``Exception`` are listed first (original
    casing preserved), followed by the lowercase tokens of the whole message.
    Candidates are de-duplicated case-insensitively, keeping the first
    occurrence, and anything of two characters or fewer is dropped.

    Args:
        error_message: Raw error text; ``None`` or empty yields ``[]``.

    Returns:
        At most eight keywords, in first-seen order.
    """
    if not error_message:
        return []

    normalized = clean_text(error_message)
    candidates = re.findall(r"[A-Za-z]+Error|[A-Za-z]+Exception", normalized)
    candidates += tokenize(normalized)

    # Dict insertion order doubles as "first occurrence wins" de-duplication.
    first_seen = {}
    for candidate in candidates:
        key = candidate.lower()
        if len(key) > 2:
            first_seen.setdefault(key, candidate)

    return list(first_seen.values())[:8]
def extract_code_keywords(code: Optional[str]) -> List[str]:
    """Collect up to five declared or imported names from a code snippet.

    The snippet is scanned, in this order, for ``def`` names, ``class``
    names, ``import`` targets and ``from ... import`` modules. Results are
    de-duplicated case-insensitively, keeping the first spelling seen.

    Args:
        code: Source text to scan; ``None`` or empty yields ``[]``.

    Returns:
        At most five names, grouped by pattern order and then by position
        in *code*.
    """
    if not code:
        return []

    declaration_patterns = (
        r"\bdef\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bclass\s+([A-Za-z_][A-Za-z0-9_]*)",
        r"\bimport\s+([A-Za-z_][A-Za-z0-9_.]*)",
        r"\bfrom\s+([A-Za-z_][A-Za-z0-9_.]*)\s+import\b",
    )

    # Keyed by lowercase name; setdefault keeps the first-seen spelling.
    first_seen = {}
    for regex in declaration_patterns:
        for name in re.findall(regex, code):
            first_seen.setdefault(name.lower(), name)

    return list(first_seen.values())[:5]
def build_stack_query(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
) -> str:
    """Assemble a space-separated search query from the request details.

    Terms are added in this order:
      1. the first error keyword, wrapped in double quotes;
      2. the cleaned framework, then the cleaned language;
      3. up to two names extracted from *code*;
      4. up to three tokens of *message* that are not in the ignore list.

    Args:
        message: The user's free-text message.
        error_message: Optional error text.
        language: Optional programming-language hint.
        framework: Optional framework hint.
        code: Optional code snippet.

    Returns:
        The query string; empty when no term could be derived.
    """
    # Filler words and language names that are dropped from the message.
    ignored_words = {
        "fix", "this", "code", "issue", "problem", "help", "please",
        "python", "javascript", "java", "flutter", "react"
    }

    terms: List[str] = []

    error_terms = extract_error_keywords(error_message)
    if error_terms:
        terms.append(f'"{error_terms[0]}"')

    for hint in (framework, language):
        if hint:
            terms.append(clean_text(hint))

    terms += extract_code_keywords(code)[:2]

    kept_words = [
        word for word in tokenize(message)
        if word.lower() not in ignored_words
    ]
    terms += kept_words[:3]

    # Empty terms (e.g. a hint that cleaned down to "") are skipped.
    return " ".join(term for term in terms if term).strip()
def _contains_term(text: str, term: str) -> bool:
    """Return True when *term* occurs in *text* as a whole token.

    The match may not be preceded by a word character, nor followed by a
    letter, underscore, ``+`` or ``#``. A trailing digit is allowed so that
    "python" still matches "python3", while "c" no longer matches inside
    "cannot" or "c++", and "java" no longer matches "javascript".
    An empty *term* never matches.
    """
    if not term:
        return False
    pattern = r"(?<!\w)" + re.escape(term) + r"(?![^\W\d]|[+#])"
    return re.search(pattern, text) is not None


def compute_stack_relevance(
    title: str,
    tags: List[str],
    snippet: str,
    message: str,
    error_message: Optional[str],
    language: Optional[str],
    framework: Optional[str],
    score: int,
    is_answered: bool,
) -> float:
    """Score how well one search result matches the request.

    The score is the sum of:
      * 2.0 when the question is answered;
      * 0.4 per vote, using at most 10 votes (a negative vote count lowers
        the score);
      * for *language* and for *framework*: 3.0 when it appears in the title
        as a whole token and 4.0 when it equals one of the tags;
      * for each of the first four error keywords: 6.0 when found in the
        title, otherwise 3.0 when found in the snippet;
      * for each of the first six message tokens of four or more characters:
        1.5 when found in the title, otherwise 0.75 when found in the snippet.

    Args:
        title: Result title.
        tags: Result tags; ``None`` is treated as no tags.
        snippet: Result summary text.
        message: The user's free-text message.
        error_message: Optional error text.
        language: Optional programming-language hint.
        framework: Optional framework hint.
        score: Vote count; ``None`` is treated as 0.
        is_answered: Whether the question is marked as answered.

    Returns:
        The relevance score as a float.
    """
    title_l = clean_text(title).lower()
    snippet_l = clean_text(snippet).lower()
    tags_l = {clean_text(tag).lower() for tag in (tags or [])}

    relevance = 0.0
    if is_answered:
        relevance += 2.0

    votes = float(score if score is not None else 0)
    relevance += min(votes, 10.0) * 0.4

    for hint in (language, framework):
        term = clean_text(hint).lower()
        # A hint that cleans down to "" must not score: "" is a substring of
        # every title and would otherwise always earn the title bonus.
        if not term:
            continue
        if _contains_term(title_l, term):
            relevance += 3.0
        if term in tags_l:
            relevance += 4.0

    for keyword in extract_error_keywords(error_message)[:4]:
        needle = keyword.lower()
        if needle in title_l:
            relevance += 6.0
        elif needle in snippet_l:
            relevance += 3.0

    for token in tokenize(message)[:6]:
        needle = token.lower()
        # Very short words match almost anything as substrings; skip them.
        if len(needle) < 4:
            continue
        if needle in title_l:
            relevance += 1.5
        elif needle in snippet_l:
            relevance += 0.75

    return relevance
def search_stackoverflow(
    message: str,
    error_message: Optional[str] = None,
    language: Optional[str] = None,
    framework: Optional[str] = None,
    code: Optional[str] = None,
    max_results: Optional[int] = None,
) -> List[RetrievedEvidence]:
    """Search Stack Overflow and return the best-matching questions.

    A query is built with ``build_stack_query`` and sent to the Stack
    Exchange advanced-search endpoint. Each returned question is scored with
    ``compute_stack_relevance``; results without a title or scoring below
    2.0 are dropped, and the rest are returned best-first.

    Args:
        message: The user's free-text message.
        error_message: Optional error text.
        language: Optional programming-language hint.
        framework: Optional framework hint.
        code: Optional code snippet.
        max_results: Maximum number of results; ``None`` or 0 falls back to
            ``settings.MAX_STACK_RESULTS``.

    Returns:
        Evidence items sorted by descending relevance. An empty list is
        returned when no query could be built or when the request or its
        JSON decoding fails (the failure is printed, not raised).
    """
    query = build_stack_query(
        message=message,
        error_message=error_message,
        language=language,
        framework=framework,
        code=code,
    )
    if not query:
        return []

    limit = max_results or settings.MAX_STACK_RESULTS
    # Fetch twice the requested amount (at least 6) because low-relevance
    # results are filtered out below. The Stack Exchange API rejects a
    # pagesize above 100, so cap it to keep large limits from failing.
    max_page_size = 100
    page_size = min(max(limit * 2, 6), max_page_size)

    params = {
        "order": "desc",
        "sort": "relevance",
        "q": query,
        "site": STACK_SITE,
        "pagesize": page_size,
        "filter": "default",
    }
    if settings.STACKOVERFLOW_KEY:
        params["key"] = settings.STACKOVERFLOW_KEY

    # Best-effort: any failure (network, HTTP status, invalid JSON) yields
    # no evidence instead of propagating to the caller.
    try:
        response = requests.get(
            STACK_API_BASE,
            params=params,
            timeout=settings.SEARCH_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        print(f"Stack Overflow search failed: {e}")
        return []

    # Tolerate a non-object payload or an explicit `"items": null`.
    items = data.get("items") if isinstance(data, dict) else None

    evidence_list: List[RetrievedEvidence] = []
    for item in items or []:
        title = clean_text(item.get("title"))
        if not title:
            continue

        link = clean_text(item.get("link"))
        score = item.get("score", 0)
        tags = item.get("tags", []) or []
        is_answered = item.get("is_answered", False)

        snippet_parts = []
        if tags:
            snippet_parts.append(f"Tags: {', '.join(tags)}")
        snippet_parts.append(f"Answered: {'yes' if is_answered else 'no'}")
        snippet_parts.append(f"Score: {score}")
        snippet = " | ".join(snippet_parts)

        relevance = compute_stack_relevance(
            title=title,
            tags=tags,
            snippet=snippet,
            message=message,
            error_message=error_message,
            language=language,
            framework=framework,
            score=score,
            is_answered=is_answered,
        )
        if relevance < 2.0:
            continue

        evidence_list.append(
            RetrievedEvidence(
                source_type=SourceType.STACKOVERFLOW,
                title=title,
                snippet=snippet,
                url=link or None,
                score=relevance,
            )
        )

    evidence_list.sort(
        key=lambda x: x.score if x.score is not None else -1,
        reverse=True,
    )
    return evidence_list[:limit]