lafifi-24 committed on
Commit 933c2fa · 1 Parent(s): 3920f9c
app.py ADDED
@@ -0,0 +1,131 @@
+ from dotenv import load_dotenv
+ load_dotenv(".env")
+
+ import ast
+ import uuid
+
+ import gradio as gr
+ from llama_index.core.schema import TextNode
+ from structlog import get_logger
+
+ from embedding_service import EmbeddingService
+ from github_repo_downloader import GitHubRepoDownloader
+ from graph_converter import pyan_to_networkx
+ from level_computer import compute_node_levels
+ from prompt_generator import generate_explaination_by_level
+ from pyan_insperation.analyzer import CallGraphVisitor
+
+ logger = get_logger(__name__)
+
+ # In-memory store mapping project IDs to their embedding services.
+ local_db = {}
+
+
+ def ingest(repo_url, branch="main"):
+     """
+     Download a GitHub repo, parse its Python code, build a code graph, and index it.
+
+     Args:
+         repo_url: Public GitHub repo URL.
+         branch: Branch to index; defaults to "main".
+
+     Yields:
+         Progress messages, then the project_id used to reference the indexed project.
+     """
+     repo = GitHubRepoDownloader(repo_url=repo_url, branch=branch)
+     yield "repo downloaded"
+
+     files = repo.read_files(file_filter=lambda path: path.endswith(".py"))
+     yield "Python files loaded"
+
+     pyan_graph = CallGraphVisitor(files=files)
+     graph = pyan_to_networkx(pyan_graph=pyan_graph)
+     yield "graph built"
+
+     levels = compute_node_levels(graph=graph)
+     yield "generating explanations"
+     generate_explaination_by_level(graph=graph, levels=levels)
+     yield "embedding"
+
+     nodes = []
+     for node in graph.nodes:
+         # Skip built-ins, lambdas, and nodes without source.
+         if node.namespace is None or node.get_short_name() == "lambda" or node.ast_node is None:
+             continue
+         # Index the LLM explanation when one was generated, otherwise the raw source.
+         text = node.explination if hasattr(node, "explination") else ast.unparse(node.ast_node)
+         nodes.append(TextNode(
+             text=text,
+             metadata={
+                 "name": node.name,
+                 "filename": node.filename,
+                 "type": node.flavor.name,
+                 "namespace": node.namespace,
+             },
+         ))
+
+     embedding = EmbeddingService("test")
+     embedding.prepare_index(nodes)
+     project_id = uuid.uuid4().hex
+     local_db[project_id] = embedding
+
+     yield project_id
+
+
+ def query(project_id, question, top_k=10):
+     """
+     Retrieve the indexed nodes most relevant to a question about a project.
+
+     Args:
+         project_id: ID returned from ingest().
+         question: User question about the codebase.
+         top_k: Number of nodes to retrieve.
+
+     Yields:
+         The retrieved context as formatted text.
+     """
+     yield local_db[project_id].infer(question, top_k=top_k)
+
+
+ demo = gr.TabbedInterface(
+     [
+         gr.Interface(
+             ingest,
+             [
+                 gr.Textbox(label="repo_url"),
+                 gr.Textbox(label="branch", value="main"),
+             ],
+             gr.Textbox(label="project_id"),
+         ),
+         gr.Interface(
+             query,
+             [
+                 gr.Textbox(label="project_id"),
+                 gr.Textbox(label="query"),
+                 gr.Number(value=5, label="top_k", minimum=2, maximum=20),
+             ],
+             gr.Textbox(label="answer"),
+         ),
+     ],
+     [
+         "Ingest Repo",
+         "Query Project",
+     ],
+ )
+
+ demo.launch(mcp_server=True)
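
A hedged sketch of driving the app programmatically with `gradio_client` (not part of this commit); the endpoint names and repo URL are assumptions, so check `client.view_api()` for the real ones:

```python
# Hypothetical sketch: drive the running app with gradio_client.
# Endpoint names are assumptions - run client.view_api() to see the real ones.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # URL printed by demo.launch()
client.view_api()  # lists the actual api_name for each tab

# ingest streams progress strings; predict() returns the final yield (the project ID).
project_id = client.predict(
    "https://github.com/owner/repo",  # placeholder repo URL
    "main",
    api_name="/predict",
)
answer = client.predict(project_id, "Where is the ZIP cache handled?", 5, api_name="/predict_1")
print(answer)
```
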
embedding_service.py ADDED
@@ -0,0 +1,102 @@
+ from typing import Any, List
+
+ import chromadb
+ from llama_index.core import Settings, StorageContext, VectorStoreIndex
+ from llama_index.core.bridge.pydantic import PrivateAttr
+ from llama_index.core.embeddings import BaseEmbedding
+ from llama_index.vector_stores.chroma import ChromaVectorStore
+ from openai import OpenAI
+ from structlog import get_logger
+
+ from modal_client import ModalClient
+
+ logger = get_logger(__name__)
+
+
+ class CustomEmbeddings(BaseEmbedding):
+     """Embedding model backed by an OpenAI-compatible endpoint."""
+
+     # Declare the client as a pydantic private attribute so it can be set in __init__.
+     _client: Any = PrivateAttr()
+
+     def __init__(
+         self,
+         base_url: str,
+         api_key: str,
+         model_name: str,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(model_name=model_name, **kwargs)
+         self._client = OpenAI(base_url=base_url, api_key=api_key)
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "custom"
+
+     async def _aget_query_embedding(self, query: str) -> List[float]:
+         return self._get_query_embedding(query)
+
+     async def _aget_text_embedding(self, text: str) -> List[float]:
+         return self._get_text_embedding(text)
+
+     def _get_query_embedding(self, query: str) -> List[float]:
+         return self._client.embeddings.create(
+             model=self.model_name,
+             input=[query],
+         ).data[0].embedding
+
+     def _get_text_embedding(self, text: str) -> List[float]:
+         return self._client.embeddings.create(
+             model=self.model_name,
+             input=[text],
+         ).data[0].embedding
+
+     def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
+         response = self._client.embeddings.create(
+             model=self.model_name,
+             input=texts,
+         )
+         return [item.embedding for item in response.data]
+
+
+ class EmbeddingService:
+     def __init__(self, collection_name):
+         config = ModalClient.embedding_config()
+         Settings.embed_model = CustomEmbeddings(
+             api_key=config.get("api_key"),
+             base_url=config.get("base_url"),
+             model_name=config.get("model"),
+             embed_batch_size=32,
+         )
+         Settings.chunk_size = 1024
+         chroma_client = chromadb.EphemeralClient()
+         chroma_collection = chroma_client.create_collection(collection_name)
+         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+         self.storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+     def prepare_index(self, nodes):
+         # Build the index directly from TextNodes (from_documents expects Document objects).
+         self.index = VectorStoreIndex(nodes=nodes, storage_context=self.storage_context)
+
+     def infer(self, query, top_k=10):
+         retriever = self.index.as_retriever(similarity_top_k=top_k)
+         results = retriever.retrieve(query)
+         text = ""
+         for result in results:
+             text += "\n -------------------------- \n"
+             text += f"name = {result.metadata['name']}\n"
+             text += f"filename = {result.metadata['filename']}\n"
+             text += f"type = {result.metadata['type']}\n"
+             text += f"namespace = {result.metadata['namespace']}\n"
+             text += f"content = {result.text}\n"
+         return text
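
A minimal usage sketch, assuming the Modal embedding endpoint from modal_client.py is reachable; the node contents and metadata here are made up for illustration:

```python
# Sketch: index one TextNode and retrieve it back.
from llama_index.core.schema import TextNode
from embedding_service import EmbeddingService

nodes = [
    TextNode(
        text="Downloads a GitHub repo as a ZIP archive and caches it on disk.",
        metadata={"name": "GitHubRepoDownloader", "filename": "github_repo_downloader.py",
                  "type": "CLASS", "namespace": "github_repo_downloader"},
    ),
]

service = EmbeddingService("demo-collection")  # hypothetical collection name
service.prepare_index(nodes)
print(service.infer("how is the repo downloaded?", top_k=1))
```
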
github_repo_downloader.py ADDED
@@ -0,0 +1,132 @@
+ import zipfile
+ from pathlib import Path
+ from typing import Callable, Dict, Optional
+ from urllib.parse import urlparse
+
+ import requests
+ from structlog import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class GitHubRepoDownloader:
+     def __init__(self, repo_url: str, branch: str = "main", cache_dir: str = ".cache"):
+         """
+         Initialize the downloader with a GitHub repo URL.
+
+         Args:
+             repo_url: Full GitHub repo URL (e.g., https://github.com/owner/repo)
+             branch: Branch name to download (default: main)
+             cache_dir: Directory in which to cache downloaded archives
+         """
+         self.owner, self.repo = self._parse_repo_url(repo_url)
+         self.branch = branch
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self._validate_branch()
+
+     def _parse_repo_url(self, repo_url: str) -> tuple[str, str]:
+         """Extract owner and repo name from a GitHub URL"""
+         repo_url = repo_url.rstrip('/').removesuffix('.git')
+
+         # Validate that it is a GitHub URL
+         parsed = urlparse(repo_url)
+         if parsed.netloc not in ('github.com', 'www.github.com'):
+             message = f"Not a GitHub URL: {repo_url}"
+             logger.error(message)
+             raise ValueError(message)
+
+         parts = parsed.path.strip('/').split('/')
+         if len(parts) < 2:
+             message = f"Invalid GitHub URL format: {repo_url}"
+             logger.error(message)
+             raise ValueError(message)
+
+         owner, repo = parts[0], parts[1]
+         return owner, repo
+
+     def _validate_branch(self) -> None:
+         """Validate that the branch exists in the repository"""
+         url = f"https://api.github.com/repos/{self.owner}/{self.repo}/branches/{self.branch}"
+         logger.info(f"Validating branch: {self.branch}")
+
+         response = requests.get(url)
+         if response.status_code == 404:
+             message = f"Branch '{self.branch}' not found in {self.owner}/{self.repo}"
+             logger.error(message)
+             raise ValueError(message)
+         response.raise_for_status()
+
+     def _get_cache_path(self) -> Path:
+         """Get the cache file path for this repo"""
+         return self.cache_dir / f"{self.owner}_{self.repo}_{self.branch}.zip"
+
+     def _download_zip(self) -> Path:
+         """Download the repo ZIP into the cache"""
+         cache_path = self._get_cache_path()
+
+         # Return the cached file if it exists
+         if cache_path.exists():
+             logger.info(f"Using cached file: {cache_path}")
+             return cache_path
+
+         # Download the ZIP archive for the branch
+         url = f"https://github.com/{self.owner}/{self.repo}/archive/refs/heads/{self.branch}.zip"
+         logger.info(f"Downloading {self.owner}/{self.repo} (branch: {self.branch})...")
+
+         response = requests.get(url)
+         response.raise_for_status()
+
+         with open(cache_path, 'wb') as f:
+             f.write(response.content)
+
+         logger.info(f"Saved to cache: {cache_path}")
+         return cache_path
+
+     def read_files(self, file_filter: Optional[Callable[[str], bool]] = None) -> Dict[str, str]:
+         """
+         Read files from the repo archive without extracting it.
+
+         Args:
+             file_filter: Optional predicate to filter files
+                 (e.g., lambda path: path.endswith('.py'))
+
+         Returns:
+             Dictionary mapping file paths to their contents
+         """
+         cache_path = self._download_zip()
+         files_content = {}
+
+         with zipfile.ZipFile(cache_path) as zip_file:
+             for filename in zip_file.namelist():
+                 # Skip directory entries
+                 if filename.endswith('/'):
+                     continue
+
+                 # Remove the root folder (format: repo-branch/path/to/file)
+                 clean_path = '/'.join(filename.split('/')[1:])
+                 if not clean_path:
+                     continue
+
+                 # Apply the filter
+                 if file_filter and not file_filter(clean_path):
+                     continue
+
+                 logger.info(f"Reading: {clean_path}")
+
+                 try:
+                     with zip_file.open(filename) as file:
+                         files_content[clean_path] = file.read().decode('utf-8', errors='ignore')
+                 except Exception as e:
+                     logger.exception(f"⚠️ Error reading {clean_path}: {e}")
+
+         return files_content
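
A quick usage sketch (network access assumed; the repo shown is just an example of a public repository):

```python
# Example: read only the Python files of a public repo.
from github_repo_downloader import GitHubRepoDownloader

downloader = GitHubRepoDownloader(
    repo_url="https://github.com/psf/requests",  # any public repo works
    branch="main",
)
files = downloader.read_files(file_filter=lambda path: path.endswith(".py"))
for path in sorted(files)[:5]:
    print(path, len(files[path]), "chars")
```
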
graph_converter.py ADDED
@@ -0,0 +1,41 @@
+ """
+ Graph Converter Module
+
+ Converts Pyan call graphs to NetworkX directed graphs for further processing.
+ """
+
+ import networkx as nx
+ from pyan_insperation.analyzer import CallGraphVisitor
+
+
+ def pyan_to_networkx(pyan_graph: CallGraphVisitor) -> nx.DiGraph:
+     """
+     Convert a Pyan call graph to a NetworkX directed graph.
+
+     This function processes both defines_edges (containment relationships such as
+     class-contains-method) and uses_edges (usage relationships such as
+     function-calls-function) from the Pyan analyzer and creates a unified
+     NetworkX graph with labeled edges.
+
+     Args:
+         pyan_graph: CallGraphVisitor instance with defines_edges and uses_edges
+
+     Returns:
+         nx.DiGraph with nodes and labeled edges ("contains" or "use")
+     """
+     graph = nx.DiGraph()
+
+     # Process defines_edges - containment relationships
+     for node, defined_nodes in pyan_graph.defines_edges.items():
+         graph.add_node(node)
+         for defined_node in defined_nodes:
+             graph.add_node(defined_node)
+             graph.add_edge(node, defined_node, label="contains")
+
+     # Process uses_edges - usage relationships
+     for node, used_nodes in pyan_graph.uses_edges.items():
+         graph.add_node(node)
+         for used_node in used_nodes:
+             graph.add_node(used_node)
+             graph.add_edge(node, used_node, label="use")
+
+     return graph
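
Since CallGraphVisitor itself lives outside this commit, here is a sketch of the conversion contract using a stand-in object with the same defines_edges/uses_edges shape:

```python
# Stand-in for CallGraphVisitor: dicts mapping a node to the nodes it defines/uses.
from types import SimpleNamespace
from graph_converter import pyan_to_networkx

fake_visitor = SimpleNamespace(
    defines_edges={"ClassA": ["ClassA.method"]},    # containment
    uses_edges={"ClassA.method": ["helper_func"]},  # usage
)
graph = pyan_to_networkx(pyan_graph=fake_visitor)
print(list(graph.edges(data=True)))
# [('ClassA', 'ClassA.method', {'label': 'contains'}),
#  ('ClassA.method', 'helper_func', {'label': 'use'})]
```
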
level_computer.py ADDED
@@ -0,0 +1,57 @@
+ """
+ Level Computer Module
+
+ Computes dependency levels for graph nodes to enable efficient batching for LLM processing.
+ """
+
+ import networkx as nx
+
+
+ def compute_node_levels(graph: nx.DiGraph) -> dict:
+     """
+     Compute the level of each node based on successor depth.
+
+     Nodes are assigned levels based on their position in the dependency graph:
+     - Level 0: nodes with no successors (leaf nodes)
+     - Level N: 1 + max(successor levels)
+
+     Cycles are handled by condensing the graph into strongly connected
+     components before computing levels.
+
+     Args:
+         graph: NetworkX directed graph
+
+     Returns:
+         Dictionary mapping each level (int) to the list of nodes at that level
+     """
+     # Condense the graph to handle strongly connected components (cycles)
+     C_graph = nx.condensation(graph)
+     scc_map = C_graph.graph['mapping']
+
+     levels = {}
+
+     def level(node):
+         """Recursively compute a node's level with memoization."""
+         if node in levels:
+             return levels[node]
+
+         succ = list(C_graph.successors(node))
+         if not succ:  # No outgoing edges -> level 0
+             levels[node] = 0
+         else:
+             levels[node] = 1 + max(level(s) for s in succ)
+         return levels[node]
+
+     # Compute levels for all nodes in the condensed graph
+     for node in C_graph.nodes():
+         level(node)
+
+     # Map each original node to its component's level, then group nodes by level.
+     node_to_level = {node: levels[scc_map[node]] for node in graph.nodes()}
+     level_to_nodes = {}
+     for node, node_level in node_to_level.items():
+         level_to_nodes.setdefault(node_level, []).append(node)
+     return level_to_nodes
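
A worked example on a toy graph with a cycle, showing how the parse/render strongly connected component collapses to level 0:

```python
# Worked example: a small dependency chain with a cycle.
import networkx as nx
from level_computer import compute_node_levels

g = nx.DiGraph()
g.add_edge("main", "parse")    # main uses parse
g.add_edge("main", "render")   # main uses render
g.add_edge("parse", "render")
g.add_edge("render", "parse")  # cycle: parse <-> render collapses to one SCC

print(compute_node_levels(g))
# {1: ['main'], 0: ['parse', 'render']}
```
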
modal_client.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+
+ from openai import OpenAI
+
+ # Model names
+ EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
+ EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"
+
+
+ class ModalClient:
+
+     @staticmethod
+     def infer_llm(prompts: list[str], max_tokens: int = 800):
+         client = OpenAI(
+             base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
+             api_key=os.environ.get('VLLM_API_KEY', 'not-needed'),
+         )
+
+         def process_one(prompt):
+             response = client.chat.completions.create(
+                 model=EXPLANATION_MODEL,
+                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=max_tokens,
+             )
+             return response.choices[0].message.content
+
+         # Send the prompts concurrently from a thread pool; the server batches them.
+         with ThreadPoolExecutor(max_workers=32) as executor:
+             results = list(executor.map(process_one, prompts))
+
+         return results
+
+     @staticmethod
+     def embedding_config():
+         return {
+             "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
+             "api_key": os.environ.get('VLLM_API_KEY', 'not-needed'),
+             "model": EMBEDDING_MODEL,
+         }
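
A minimal sketch of calling the batch client, assuming MODAL_URL_LLM_INFERENCE and VLLM_API_KEY point at the deployed vLLM server; the snippets are made up:

```python
# Sketch: batch-explain two code snippets through the Modal-hosted LLM.
from modal_client import ModalClient

answers = ModalClient.infer_llm([
    "Explain: def add(a, b): return a + b",
    "Explain: squares = [x * x for x in range(10)]",
])
for answer in answers:
    print(answer)
```
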
modal_functions.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import subprocess
+
+ import modal
+
+ app = modal.App("code-understanding")
+
+ vllm_image = (
+     modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
+     .entrypoint([])
+     .uv_pip_install(
+         "vllm==0.11.2",
+         "huggingface-hub==0.36.0",
+         "flashinfer-python==0.5.2",
+     )
+     .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
+ )
+
+ # Configuration
+ EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")
+ EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
+ VLLM_PORT = 8000
+ MINUTES = 60
+ N_GPU = 1
+ FAST_BOOT = True
+
+
+ @app.function(
+     image=vllm_image,
+     gpu=f"A10:{N_GPU}",
+     scaledown_window=55 * MINUTES,  # how long to stay up with no requests
+     timeout=10 * MINUTES,  # how long to wait for container start
+     secrets=[modal.Secret.from_name("vllm-auth")],
+ )
+ @modal.concurrent(max_inputs=32)
+ @modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
+ def explain_code_batch():
+     cmd = [
+         "vllm",
+         "serve",
+         "--uvicorn-log-level=info",
+         EXPLANATION_MODEL,
+         "--served-model-name",
+         EXPLANATION_MODEL,
+         "--host", "0.0.0.0",
+         "--port", str(VLLM_PORT),
+         "--max-model-len", "40000",
+     ]
+
+     # Eager mode boots faster; CUDA graphs serve faster once warm.
+     cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
+     cmd += ["--tensor-parallel-size", str(N_GPU)]
+
+     print(cmd)
+     subprocess.Popen(" ".join(cmd), shell=True)
+
+
+ @app.function(
+     image=vllm_image,
+     gpu=f"A10:{N_GPU}",
+     scaledown_window=55 * MINUTES,
+     timeout=10 * MINUTES,
+     secrets=[modal.Secret.from_name("vllm-auth")],
+ )
+ @modal.concurrent(max_inputs=32)
+ @modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
+ def generate_embeddings_batch():
+     cmd = [
+         "vllm",
+         "serve",
+         "--uvicorn-log-level=info",
+         EMBEDDING_MODEL,
+         "--served-model-name",
+         EMBEDDING_MODEL,
+         "--host", "0.0.0.0",
+         "--port", str(VLLM_PORT),
+         "--task", "embedding",
+         "--max-model-len", "40000",
+     ]
+
+     cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
+     cmd += ["--tensor-parallel-size", str(N_GPU)]
+
+     print(cmd)
+     subprocess.Popen(" ".join(cmd), shell=True)
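
Once deployed, each web server speaks the OpenAI-compatible vLLM API. A small probe sketch; it assumes MODAL_URL_LLM_EMBEDDING holds the server's OpenAI-compatible base URL, as modal_client.py uses it:

```python
# Probe the deployed embedding server's model list.
import os
from openai import OpenAI

client = OpenAI(
    base_url=os.environ["MODAL_URL_LLM_EMBEDDING"],
    api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
)
print([m.id for m in client.models.list()])  # expect ["Qwen/Qwen3-Embedding-8B"]
```
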
prompt_generator.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Prompt Generator Module
+
+ Generates structured prompts for LLM-based code explanation, organized by node level.
+ """
+ import ast
+ import copy
+
+ import networkx as nx
+ from structlog import get_logger
+
+ from modal_client import ModalClient
+
+ logger = get_logger(__name__)
+
+
+ def generate_explaination_by_level(graph: nx.DiGraph, levels: dict) -> dict[int, dict]:
+     """
+     Generate LLM prompts organized by node level and attach the resulting
+     explanations to the graph nodes.
+
+     Each prompt includes:
+     - The file path
+     - Used modules (name + content from graph successors)
+     - The node's content (unparsed AST), with already-explained children
+       replaced by summary markers
+
+     Nodes without a namespace are skipped, as they typically represent
+     external or incomplete references.
+
+     Args:
+         graph: NetworkX directed graph with code nodes
+         levels: Dictionary mapping each level to its list of nodes
+
+     Returns:
+         Dictionary mapping level -> {node: prompt_string}
+     """
+     prompts_by_level = {}
+
+     # Process levels bottom-up so child explanations exist before their parents' prompts.
+     for level in range(max(levels.keys()) + 1):
+         if level not in levels:
+             continue
+
+         batch = {}
+
+         for node in levels[level]:
+             # Skip built-ins, lambdas, and nodes without source.
+             if node.namespace is None or node.get_short_name() == "lambda" or node.ast_node is None:
+                 continue
+             # Short snippets are indexed verbatim (see app.py) rather than explained.
+             if len(ast.unparse(node.ast_node)) < 1000:
+                 continue
+
+             prompt = """You are a Python code analysis expert.
+
+ **CRITICAL RULES:**
+ 1. ONLY use information directly visible in the "TARGET CODE" section
+ 2. For methods marked as "[SUMMARIZED]", reference them by their actual name shown
+ 3. If a method body is replaced with a summary, DO NOT invent details about its implementation
+ 4. State "implementation details not shown" for summarized methods
+
+ Your explanation must be brief and cover:
+ - Purpose: What this code does (1-2 sentences)
+ - Inputs: Parameters (only those visible)
+ - Outputs: Return values (only those visible)
+ - Exceptions: Only exceptions explicitly raised in the visible code (1 sentence)
+
+ """
+             node_copy = copy.deepcopy(node)
+
+             # Collect used modules from graph successors and summarize contained children.
+             used_modules = []
+             summarized_methods = []
+             for used_node in graph.successors(node):
+                 # Ignore Python built-ins and nodes without source.
+                 if used_node.namespace is None or used_node.get_short_name() == "lambda" or used_node.ast_node is None:
+                     continue
+
+                 label = graph.get_edge_data(node, used_node).get("label")
+
+                 if label == 'contains':
+                     # Replace already-explained children with a summary marker.
+                     if used_node.ast_node in node.ast_node.body and hasattr(used_node, "explination"):
+                         if isinstance(used_node.ast_node, ast.FunctionDef):
+                             # Keep the function signature visible
+                             signature = f"def {used_node.ast_node.name}({ast.unparse(used_node.ast_node.args)})"
+                             if used_node.ast_node.returns:
+                                 signature += f" -> {ast.unparse(used_node.ast_node.returns)}"
+
+                             marker_text = f"""[SUMMARIZED METHOD]
+ Method: {used_node.name}
+ Signature: {signature}
+ Summary: {used_node.explination}
+ Note: Full implementation replaced for brevity"""
+                         elif isinstance(used_node.ast_node, ast.ClassDef):
+                             marker_text = f"""[SUMMARIZED CLASS]
+ Class: {used_node.name}
+ Summary: {used_node.explination}
+ Note: Full implementation replaced for brevity"""
+                         else:
+                             marker_text = f"""[SUMMARIZED]
+ Name: {used_node.name}
+ Summary: {used_node.explination}"""
+
+                         new_child = ast.Expr(value=ast.Constant(value=marker_text))
+                         for i, child in enumerate(node.ast_node.body):
+                             if child == used_node.ast_node:
+                                 node_copy.ast_node.body[i] = new_child
+                                 summarized_methods.append(used_node.name)
+                                 break
+                 elif label == 'use':
+                     used_modules.append(used_node)
+
+             # Build the prompt
+             prompt += f"**Target File Path:** {node.filename}\n\n"
+             logger.info(f"number of used modules: {len(used_modules)}")
+             if used_modules:
+                 prompt += "**External Dependencies Used:**\n"
+                 for used_node in used_modules:
+                     if hasattr(used_node, "explination"):
+                         prompt += f"""- **{used_node.name}** [EXPLAINED]
+   - File: {used_node.filename}
+   - Explanation: {used_node.explination}
+ """
+                     else:
+                         prompt += f"""- **{used_node.name}**
+   - File: {used_node.filename}
+   - Python Code: {ast.unparse(used_node.ast_node)}
+ """
+
+             if summarized_methods:
+                 prompt += f"**Note:** The following methods are summarized in the code below: {', '.join(summarized_methods)}\n\n"
+
+             prompt += f"""**TARGET CODE:**
+ ```python
+ {ast.unparse(node_copy.ast_node)}
+ ```
+
+ Explain the TARGET CODE above. Be brief and precise.
+ """
+
+             batch[node] = prompt
+
+         if batch:
+             results = ModalClient.infer_llm(list(batch.values()))
+             for index, node in enumerate(batch.keys()):
+                 node.explination = results[index]
+             prompts_by_level[level] = batch
+
+     return prompts_by_level
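
The core trick above, swapping an already-explained child out of the parent's AST for a marker string, can be seen in isolation in this small sketch:

```python
# Isolated sketch: replace a child function in a module AST with a summary marker.
import ast

source = """
def helper():
    return 42

def main():
    return helper() + 1
"""
tree = ast.parse(source)
marker = ast.Expr(value=ast.Constant(value="[SUMMARIZED] helper: returns a constant"))
tree.body[0] = marker  # swap out the helper() definition
print(ast.unparse(tree))  # main() stays intact; helper is now a one-line marker
```
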
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio[mcp]
+ structlog
+ requests
+ networkx
+ matplotlib
+ modal
+ openai
+ python-dotenv
+ llama-index-vector-stores-chroma
+ llama-index