""" Code indexer: parses repo structure and helps identify the most relevant files for a given bug. No vector DB — pure in-memory. """ import logging import re from typing import List, Dict, Optional from backend.config import CODE_EXTENSIONS, MAX_FILES_TO_ANALYZE logger = logging.getLogger(__name__) def build_file_tree_string(files: List[Dict], max_lines: int = 300) -> str: """ Convert a flat list of file dicts into an indented tree string suitable for LLM context. """ paths = sorted(f["path"] for f in files) lines = [] prev_parts: List[str] = [] for path in paths: parts = path.split("/") # Find the common prefix depth common = 0 for i, (a, b) in enumerate(zip(prev_parts, parts[:-1])): if a == b: common = i + 1 else: break # Print changed directory levels for depth in range(common, len(parts) - 1): indent = " " * depth lines.append(f"{indent}📁 {parts[depth]}/") indent = " " * (len(parts) - 1) lines.append(f"{indent}📄 {parts[-1]}") prev_parts = parts[:-1] if len(lines) >= max_lines: lines.append(f"... and more files ({len(paths) - paths.index(path) - 1} remaining)") break return "\n".join(lines) def format_file_contents_for_prompt( file_contents: Dict[str, str], max_chars_per_file: int = 3000, max_total_chars: int = 20000, ) -> str: """ Format multiple file contents into a single block for LLM context. Truncates long files and respects a total character budget. """ sections = [] total_chars = 0 for path, content in file_contents.items(): if total_chars >= max_total_chars: sections.append(f"[Remaining files omitted due to context limit]") break # Add line numbers for reference lines = content.splitlines() numbered = "\n".join( f"{i+1:4d} | {line}" for i, line in enumerate(lines) ) if len(numbered) > max_chars_per_file: truncated = numbered[:max_chars_per_file] # Find a clean line boundary last_newline = truncated.rfind("\n") if last_newline > 0: truncated = truncated[:last_newline] numbered = truncated + f"\n\n... [TRUNCATED — {len(lines)} total lines, showing first {truncated.count(chr(10))} lines]" section = f"### File: `{path}`\n```\n{numbered}\n```" sections.append(section) total_chars += len(section) return "\n\n".join(sections) def extract_file_paths_from_llm_response(response: str) -> List[str]: """ Parse file paths from the LLM's relevance ranking response. Looks for backtick-quoted paths like `path/to/file.py` or **`path/to/file.py`**. """ # Match paths in backticks patterns = [ r"`([a-zA-Z0-9_\-./]+\.[a-zA-Z]+)`", # `path/to/file.ext` r"\*\*`([a-zA-Z0-9_\-./]+\.[a-zA-Z]+)`\*\*", # **`path`** ] paths = [] for pattern in patterns: found = re.findall(pattern, response) for p in found: if p not in paths and "/" in p or "." in p: paths.append(p) return paths[:MAX_FILES_TO_ANALYZE] def rank_files_by_keyword_match( files: List[Dict], keywords: List[str], ) -> List[Dict]: """ Quick keyword-based pre-filter before sending the full list to the LLM. Returns files sorted by keyword match count (descending). """ scored = [] lc_keywords = [kw.lower() for kw in keywords] for f in files: path_lower = f["path"].lower() score = sum(kw in path_lower for kw in lc_keywords) scored.append((score, f)) scored.sort(key=lambda x: -x[0]) return [f for _, f in scored] def extract_keywords_from_issue(issue_data: Dict) -> List[str]: """ Extract potential code-relevant keywords from an issue dict. Used for pre-filtering before sending to LLM. """ text = " ".join([ issue_data.get("title", ""), issue_data.get("body", ""), ]).lower() # Extract likely identifiers: CamelCase, snake_case, module names words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]{2,}\b", text) # Deduplicate while preserving order seen = set() keywords = [] for w in words: lw = w.lower() if lw not in seen and len(lw) > 3: seen.add(lw) keywords.append(lw) return keywords[:30] def get_file_summary(path: str, content: str, max_chars: int = 500) -> str: """ Generate a quick summary of a file (first N chars of meaningful content). Skips blank lines and comment-only lines at the top. """ lines = content.splitlines() meaningful = [] for line in lines: stripped = line.strip() if stripped and not stripped.startswith("#") and not stripped.startswith("//"): meaningful.append(line) if len("\n".join(meaningful)) > max_chars: break preview = "\n".join(meaningful)[:max_chars] return preview