"""
Code indexer: parses repo structure and helps identify the most
relevant files for a given bug. No vector DB — pure in-memory.
"""
import logging
import re
from typing import List, Dict, Optional
from backend.config import CODE_EXTENSIONS, MAX_FILES_TO_ANALYZE

logger = logging.getLogger(__name__)

def build_file_tree_string(files: List[Dict], max_lines: int = 300) -> str:
"""
Convert a flat list of file dicts into an indented tree string
suitable for LLM context.
"""
paths = sorted(f["path"] for f in files)
lines = []
prev_parts: List[str] = []
    for idx, path in enumerate(paths):
parts = path.split("/")
# Find the common prefix depth
common = 0
for i, (a, b) in enumerate(zip(prev_parts, parts[:-1])):
if a == b:
common = i + 1
else:
break
# Print changed directory levels
for depth in range(common, len(parts) - 1):
indent = " " * depth
lines.append(f"{indent}📁 {parts[depth]}/")
indent = " " * (len(parts) - 1)
lines.append(f"{indent}📄 {parts[-1]}")
prev_parts = parts[:-1]
        if len(lines) >= max_lines:
            remaining = len(paths) - idx - 1
            lines.append(f"... and more files ({remaining} remaining)")
            break
return "\n".join(lines)
def format_file_contents_for_prompt(
file_contents: Dict[str, str],
max_chars_per_file: int = 3000,
max_total_chars: int = 20000,
) -> str:
"""
Format multiple file contents into a single block for LLM context.
Truncates long files and respects a total character budget.
"""
sections = []
total_chars = 0
for path, content in file_contents.items():
if total_chars >= max_total_chars:
sections.append(f"[Remaining files omitted due to context limit]")
break
# Add line numbers for reference
lines = content.splitlines()
numbered = "\n".join(
f"{i+1:4d} | {line}" for i, line in enumerate(lines)
)
        if len(numbered) > max_chars_per_file:
            truncated = numbered[:max_chars_per_file]
            # Cut at a clean line boundary so no partial line is shown
            last_newline = truncated.rfind("\n")
            if last_newline > 0:
                truncated = truncated[:last_newline]
            shown = truncated.count("\n") + 1
            numbered = truncated + f"\n\n... [TRUNCATED: {len(lines)} total lines, showing first {shown} lines]"
section = f"### File: `{path}`\n```\n{numbered}\n```"
sections.append(section)
total_chars += len(section)
return "\n\n".join(sections)
def extract_file_paths_from_llm_response(response: str) -> List[str]:
"""
Parse file paths from the LLM's relevance ranking response.
Looks for backtick-quoted paths like `path/to/file.py` or **`path/to/file.py`**.
"""
# Match paths in backticks
patterns = [
r"`([a-zA-Z0-9_\-./]+\.[a-zA-Z]+)`", # `path/to/file.ext`
r"\*\*`([a-zA-Z0-9_\-./]+\.[a-zA-Z]+)`\*\*", # **`path`**
]
paths = []
for pattern in patterns:
found = re.findall(pattern, response)
for p in found:
            # Keep only new, path-like matches (`and` binds tighter than `or`,
            # so the disjunction must be parenthesized to dedupe correctly)
            if p not in paths and ("/" in p or "." in p):
paths.append(p)
return paths[:MAX_FILES_TO_ANALYZE]
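
# Illustrative usage (hypothetical LLM response):
#   extract_file_paths_from_llm_response(
#       "Start with **`backend/indexer.py`**, then check `tests/test_indexer.py`."
#   )
#   # -> ["backend/indexer.py", "tests/test_indexer.py"]
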
def rank_files_by_keyword_match(
files: List[Dict],
keywords: List[str],
) -> List[Dict]:
"""
Quick keyword-based pre-filter before sending the full list to the LLM.
Returns files sorted by keyword match count (descending).
"""
scored = []
lc_keywords = [kw.lower() for kw in keywords]
for f in files:
path_lower = f["path"].lower()
score = sum(kw in path_lower for kw in lc_keywords)
scored.append((score, f))
scored.sort(key=lambda x: -x[0])
return [f for _, f in scored]
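
# Illustrative usage (hypothetical files and keywords):
#   rank_files_by_keyword_match(
#       [{"path": "frontend/app.js"}, {"path": "backend/auth.py"}],
#       ["auth", "login"],
#   )
#   # -> backend/auth.py ranks first (one path hit), frontend/app.js after it (none)
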
def extract_keywords_from_issue(issue_data: Dict) -> List[str]:
"""
Extract potential code-relevant keywords from an issue dict.
Used for pre-filtering before sending to LLM.
"""
text = " ".join([
issue_data.get("title", ""),
issue_data.get("body", ""),
]).lower()
    # Extract likely identifiers (snake_case, module names); the text is
    # already lowercased, so CamelCase names arrive as plain lowercase words
    words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]{2,}\b", text)
# Deduplicate while preserving order
seen = set()
keywords = []
for w in words:
lw = w.lower()
if lw not in seen and len(lw) > 3:
seen.add(lw)
keywords.append(lw)
return keywords[:30]
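
# Illustrative usage (hypothetical issue dict):
#   extract_keywords_from_issue(
#       {"title": "Login fails", "body": "auth_handler raises KeyError"}
#   )
#   # -> ["login", "fails", "auth_handler", "raises", "keyerror"]
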
def get_file_summary(path: str, content: str, max_chars: int = 500) -> str:
"""
    Generate a quick summary of a file (first N chars of meaningful content).
    Skips blank lines and comment-only lines throughout.
"""
lines = content.splitlines()
meaningful = []
for line in lines:
stripped = line.strip()
        if stripped and not stripped.startswith(("#", "//")):
meaningful.append(line)
if len("\n".join(meaningful)) > max_chars:
break
preview = "\n".join(meaningful)[:max_chars]
return preview
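
# Illustrative usage (hypothetical file content):
#   get_file_summary("app.py", "# license header\n\nimport os\nx = 1")
#   # -> "import os\nx = 1"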