"""
Code indexer: parses repo structure and helps identify the most
relevant files for a given bug. No vector DB — pure in-memory.
"""
import logging
import re
from typing import Dict, List

from backend.config import CODE_EXTENSIONS, MAX_FILES_TO_ANALYZE

logger = logging.getLogger(__name__)


def build_file_tree_string(files: List[Dict], max_lines: int = 300) -> str:
    """
    Convert a flat list of file dicts into an indented tree string
    suitable for LLM context.
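
    Illustrative example (toy paths, not from a real repo):

        >>> files = [{"path": "src/app.py"}, {"path": "src/utils/io.py"}]
        >>> print(build_file_tree_string(files))
        📁 src/
          📄 app.py
          📁 utils/
            📄 io.py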
    """
    paths = sorted(f["path"] for f in files)

    lines = []
    prev_parts: List[str] = []

    for idx, path in enumerate(paths):
        parts = path.split("/")
        # Find the common prefix depth
        common = 0
        for i, (a, b) in enumerate(zip(prev_parts, parts[:-1])):
            if a == b:
                common = i + 1
            else:
                break

        # Print changed directory levels
        for depth in range(common, len(parts) - 1):
            indent = "  " * depth
            lines.append(f"{indent}📁 {parts[depth]}/")

        indent = "  " * (len(parts) - 1)
        lines.append(f"{indent}📄 {parts[-1]}")
        prev_parts = parts[:-1]

        if len(lines) >= max_lines:
            lines.append(f"... and more files ({len(paths) - idx - 1} remaining)")
            break

    return "\n".join(lines)


def format_file_contents_for_prompt(
    file_contents: Dict[str, str],
    max_chars_per_file: int = 3000,
    max_total_chars: int = 20000,
) -> str:
    """
    Format multiple file contents into a single block for LLM context.
    Truncates long files and respects a total character budget.
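
    Illustrative example (one short file, no truncation):

        >>> print(format_file_contents_for_prompt({"a.py": "x = 1"}))
        ### File: `a.py`
        ```
           1 | x = 1
        ```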
    """
    sections = []
    total_chars = 0

    for path, content in file_contents.items():
        if total_chars >= max_total_chars:
            sections.append(f"[Remaining files omitted due to context limit]")
            break

        # Add line numbers for reference
        lines = content.splitlines()
        numbered = "\n".join(
            f"{i+1:4d} | {line}" for i, line in enumerate(lines)
        )

        if len(numbered) > max_chars_per_file:
            truncated = numbered[:max_chars_per_file]
            # Find a clean line boundary
            last_newline = truncated.rfind("\n")
            if last_newline > 0:
                truncated = truncated[:last_newline]
            shown = truncated.count("\n") + 1
            numbered = truncated + f"\n\n... [TRUNCATED: {len(lines)} total lines, showing first {shown} lines]"

        section = f"### File: `{path}`\n```\n{numbered}\n```"
        sections.append(section)
        total_chars += len(section)

    return "\n\n".join(sections)


def extract_file_paths_from_llm_response(response: str) -> List[str]:
    """
    Parse file paths from the LLM's relevance ranking response.
    Looks for backtick-quoted paths like `path/to/file.py` or **`path/to/file.py`**.
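
    Illustrative example (assumes MAX_FILES_TO_ANALYZE >= 2):

        >>> extract_file_paths_from_llm_response("Start with `src/auth.py`, then **`src/db.py`**.")
        ['src/auth.py', 'src/db.py']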
    """
    # Match backtick-quoted paths. A single pattern covers both plain
    # `path/to/file.ext` and bold **`path/to/file.ext`**, since the
    # backticked core is identical in both forms.
    pattern = r"`([a-zA-Z0-9_\-./]+\.[a-zA-Z]+)`"
    paths = []
    for p in re.findall(pattern, response):
        # The regex already requires a file extension, so only dedupe here.
        if p not in paths:
            paths.append(p)

    return paths[:MAX_FILES_TO_ANALYZE]


def rank_files_by_keyword_match(
    files: List[Dict],
    keywords: List[str],
) -> List[Dict]:
    """
    Quick keyword-based ranking used to pre-filter before sending the full
    list to the LLM. No files are dropped here; all files are returned,
    sorted by keyword match count (descending), for the caller to slice.
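
    Illustrative example:

        >>> files = [{"path": "src/ui/button.py"}, {"path": "src/auth/login.py"}]
        >>> [f["path"] for f in rank_files_by_keyword_match(files, ["auth", "login"])]
        ['src/auth/login.py', 'src/ui/button.py']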
    """
    scored = []
    lc_keywords = [kw.lower() for kw in keywords]

    for f in files:
        path_lower = f["path"].lower()
        score = sum(kw in path_lower for kw in lc_keywords)
        scored.append((score, f))

    scored.sort(key=lambda x: -x[0])
    return [f for _, f in scored]


def extract_keywords_from_issue(issue_data: Dict) -> List[str]:
    """
    Extract potential code-relevant keywords from an issue dict.
    Used for pre-filtering before sending to LLM.
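
    Illustrative example:

        >>> extract_keywords_from_issue({"title": "LoginHandler crash", "body": "in auth_service"})
        ['loginhandler', 'crash', 'auth_service']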
    """
    text = " ".join([
        issue_data.get("title", ""),
        issue_data.get("body", ""),
    ]).lower()

    # Extract word-like tokens (identifiers, module names). The text was
    # lowercased above, so CamelCase names arrive as plain lowercase.
    words = re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]{2,}\b", text)
    # Deduplicate while preserving order
    seen = set()
    keywords = []
    for w in words:
        lw = w.lower()
        if lw not in seen and len(lw) > 3:
            seen.add(lw)
            keywords.append(lw)

    return keywords[:30]


def get_file_summary(path: str, content: str, max_chars: int = 500) -> str:
    """
    Generate a quick summary of a file (first N chars of meaningful content).
    Skips blank lines and comment-only lines at the top.
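
    Illustrative example (chr(10) stands in for newlines so the doctest
    stays readable in a non-raw docstring):

        >>> content = chr(10).join(["# comment", "", "import os", "x = 1"])
        >>> print(get_file_summary("demo.py", content))
        import os
        x = 1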
    """
    meaningful: List[str] = []
    total = 0
    for line in content.splitlines():
        stripped = line.strip()
        if stripped and not stripped.startswith(("#", "//")):
            meaningful.append(line)
            total += len(line) + 1  # +1 for the joining newline
        if total > max_chars:
            break
    return "\n".join(meaningful)[:max_chars]
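

if __name__ == "__main__":
    # Minimal smoke test wiring the helpers together. The paths and issue
    # text below are illustrative stand-ins, not from a real repository.
    demo_files = [
        {"path": "backend/auth/session.py"},
        {"path": "backend/config.py"},
        {"path": "frontend/login.tsx"},
    ]
    demo_issue = {"title": "Session expires early", "body": "auth session refresh bug"}

    kws = extract_keywords_from_issue(demo_issue)
    print("keywords:", kws)
    for f in rank_files_by_keyword_match(demo_files, kws):
        print(" ", f["path"])
    print(build_file_tree_string(demo_files))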