lafifi-24 committed on
Commit 933c2fa · 1 Parent(s): 3920f9c
app.py ADDED
@@ -0,0 +1,131 @@
+ from dotenv import load_dotenv
+ load_dotenv(".env")
+
+ import ast
+ import uuid
+
+ import gradio as gr
+ from llama_index.core.schema import TextNode
+ from structlog import get_logger
+
+ from embedding_service import EmbeddingService
+ from github_repo_downloader import GitHubRepoDownloader
+ from graph_converter import pyan_to_networkx
+ from level_computer import compute_node_levels
+ from prompt_generator import generate_explaination_by_level
+ from pyan_insperation.analyzer import CallGraphVisitor
+
+ logger = get_logger(__name__)
+
+ # In-memory store mapping project IDs to their embedding services.
+ local_db = {}
+
+
+ def ingest(repo_url, branch="main"):
+     """
+     Download a GitHub repo, parse its Python code, build a code graph, and index it.
+
+     Args:
+         repo_url: Public GitHub repo URL.
+         branch: Branch to index; defaults to "main".
+
+     Yields:
+         Progress messages, then the project_id used to reference the indexed project.
+     """
+     repo = GitHubRepoDownloader(repo_url=repo_url, branch=branch)
+     yield "repo downloaded"
+
+     files = repo.read_files(file_filter=lambda path: path.endswith(".py"))
+     yield "Python files loaded"
+
+     pyan_graph = CallGraphVisitor(files=files)
+     graph = pyan_to_networkx(pyan_graph=pyan_graph)
+     yield "graph built"
+
+     levels = compute_node_levels(graph=graph)
+     yield "generating explanations"
+     generate_explaination_by_level(graph=graph, levels=levels)
+     yield "embedding"
+
+     nodes = []
+     for node in graph.nodes:
+         # Skip built-ins, lambdas, and nodes without source.
+         if node.namespace is None or node.get_short_name() == "lambda" or node.ast_node is None:
+             continue
+         # Index the LLM explanation when one was generated, otherwise the raw source.
+         text = node.explination if hasattr(node, "explination") else ast.unparse(node.ast_node)
+         nodes.append(TextNode(
+             text=text,
+             metadata={
+                 "name": node.name,
+                 "filename": node.filename,
+                 "type": node.flavor.name,
+                 "namespace": node.namespace,
+             },
+         ))
+
+     embedding = EmbeddingService("test")
+     embedding.prepare_index(nodes)
+     project_id = uuid.uuid4().hex
+     local_db[project_id] = embedding
+
+     yield project_id
+
+
+ def query(project_id, question, top_k=10):
+     """
+     Retrieve the indexed nodes most relevant to a question about a project.
+
+     Args:
+         project_id: ID returned from ingest().
+         question: User question about the codebase.
+         top_k: Number of nodes to retrieve.
+
+     Yields:
+         The retrieved context as formatted text.
+     """
+     yield local_db[project_id].infer(question, top_k=top_k)
+
+
+ demo = gr.TabbedInterface(
+     [
+         gr.Interface(
+             ingest,
+             [
+                 gr.Textbox(label="repo_url"),
+                 gr.Textbox(label="branch", value="main"),
+             ],
+             gr.Textbox(label="project_id"),
+         ),
+         gr.Interface(
+             query,
+             [
+                 gr.Textbox(label="project_id"),
+                 gr.Textbox(label="query"),
+                 gr.Number(value=5, label="top_k", minimum=2, maximum=20),
+             ],
+             gr.Textbox(label="answer"),
+         ),
+     ],
+     [
+         "Ingest Repo",
+         "Query Project",
+     ],
+ )
+
+ demo.launch(mcp_server=True)
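
A hedged sketch of driving the app programmatically with `gradio_client` (not part of this commit); the endpoint names and repo URL are assumptions, so check `client.view_api()` for the real ones:

```python
# Hypothetical sketch: drive the running app with gradio_client.
# Endpoint names are assumptions - run client.view_api() to see the real ones.
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # URL printed by demo.launch()
client.view_api()  # lists the actual api_name for each tab

# ingest streams progress strings; predict() returns the final yield (the project ID).
project_id = client.predict(
    "https://github.com/owner/repo",  # placeholder repo URL
    "main",
    api_name="/predict",
)
answer = client.predict(project_id, "Where is the ZIP cache handled?", 5, api_name="/predict_1")
print(answer)
```
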
embedding_service.py ADDED
@@ -0,0 +1,102 @@
+ from typing import Any, List
+
+ import chromadb
+ from llama_index.core import Settings, StorageContext, VectorStoreIndex
+ from llama_index.core.bridge.pydantic import PrivateAttr
+ from llama_index.core.embeddings import BaseEmbedding
+ from llama_index.vector_stores.chroma import ChromaVectorStore
+ from openai import OpenAI
+ from structlog import get_logger
+
+ from modal_client import ModalClient
+
+ logger = get_logger(__name__)
+
+
+ class CustomEmbeddings(BaseEmbedding):
+     """Embedding model backed by an OpenAI-compatible endpoint."""
+
+     # Declare the client as a pydantic private attribute so it can be set in __init__.
+     _client: Any = PrivateAttr()
+
+     def __init__(
+         self,
+         base_url: str,
+         api_key: str,
+         model_name: str,
+         **kwargs: Any,
+     ) -> None:
+         super().__init__(model_name=model_name, **kwargs)
+         self._client = OpenAI(base_url=base_url, api_key=api_key)
+
+     @classmethod
+     def class_name(cls) -> str:
+         return "custom"
+
+     async def _aget_query_embedding(self, query: str) -> List[float]:
+         return self._get_query_embedding(query)
+
+     async def _aget_text_embedding(self, text: str) -> List[float]:
+         return self._get_text_embedding(text)
+
+     def _get_query_embedding(self, query: str) -> List[float]:
+         return self._client.embeddings.create(
+             model=self.model_name,
+             input=[query],
+         ).data[0].embedding
+
+     def _get_text_embedding(self, text: str) -> List[float]:
+         return self._client.embeddings.create(
+             model=self.model_name,
+             input=[text],
+         ).data[0].embedding
+
+     def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
+         response = self._client.embeddings.create(
+             model=self.model_name,
+             input=texts,
+         )
+         return [item.embedding for item in response.data]
+
+
+ class EmbeddingService:
+     def __init__(self, collection_name):
+         config = ModalClient.embedding_config()
+         Settings.embed_model = CustomEmbeddings(
+             api_key=config.get("api_key"),
+             base_url=config.get("base_url"),
+             model_name=config.get("model"),
+             embed_batch_size=32,
+         )
+         Settings.chunk_size = 1024
+         chroma_client = chromadb.EphemeralClient()
+         chroma_collection = chroma_client.create_collection(collection_name)
+         vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
+         self.storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+     def prepare_index(self, nodes):
+         # Build the index directly from TextNodes (from_documents expects Document objects).
+         self.index = VectorStoreIndex(nodes=nodes, storage_context=self.storage_context)
+
+     def infer(self, query, top_k=10):
+         retriever = self.index.as_retriever(similarity_top_k=top_k)
+         results = retriever.retrieve(query)
+         text = ""
+         for result in results:
+             text += "\n -------------------------- \n"
+             text += f"name = {result.metadata['name']}\n"
+             text += f"filename = {result.metadata['filename']}\n"
+             text += f"type = {result.metadata['type']}\n"
+             text += f"namespace = {result.metadata['namespace']}\n"
+             text += f"content = {result.text}\n"
+         return text
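
A minimal usage sketch, assuming the Modal embedding endpoint from modal_client.py is reachable; the node contents and metadata here are made up for illustration:

```python
# Sketch: index one TextNode and retrieve it back.
from llama_index.core.schema import TextNode
from embedding_service import EmbeddingService

nodes = [
    TextNode(
        text="Downloads a GitHub repo as a ZIP archive and caches it on disk.",
        metadata={"name": "GitHubRepoDownloader", "filename": "github_repo_downloader.py",
                  "type": "CLASS", "namespace": "github_repo_downloader"},
    ),
]

service = EmbeddingService("demo-collection")  # hypothetical collection name
service.prepare_index(nodes)
print(service.infer("how is the repo downloaded?", top_k=1))
```
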
github_repo_downloader.py ADDED
@@ -0,0 +1,132 @@
+ import zipfile
+ from pathlib import Path
+ from typing import Callable, Dict, Optional
+ from urllib.parse import urlparse
+
+ import requests
+ from structlog import get_logger
+
+ logger = get_logger(__name__)
+
+
+ class GitHubRepoDownloader:
+     def __init__(self, repo_url: str, branch: str = "main", cache_dir: str = ".cache"):
+         """
+         Initialize the downloader with a GitHub repo URL.
+
+         Args:
+             repo_url: Full GitHub repo URL (e.g., https://github.com/owner/repo)
+             branch: Branch name to download (default: main)
+             cache_dir: Directory in which to cache downloaded archives
+         """
+         self.owner, self.repo = self._parse_repo_url(repo_url)
+         self.branch = branch
+         self.cache_dir = Path(cache_dir)
+         self.cache_dir.mkdir(parents=True, exist_ok=True)
+         self._validate_branch()
+
+     def _parse_repo_url(self, repo_url: str) -> tuple[str, str]:
+         """Extract owner and repo name from a GitHub URL"""
+         repo_url = repo_url.rstrip('/').removesuffix('.git')
+
+         # Validate that it is a GitHub URL
+         parsed = urlparse(repo_url)
+         if parsed.netloc not in ('github.com', 'www.github.com'):
+             message = f"Not a GitHub URL: {repo_url}"
+             logger.error(message)
+             raise ValueError(message)
+
+         parts = parsed.path.strip('/').split('/')
+         if len(parts) < 2:
+             message = f"Invalid GitHub URL format: {repo_url}"
+             logger.error(message)
+             raise ValueError(message)
+
+         owner, repo = parts[0], parts[1]
+         return owner, repo
+
+     def _validate_branch(self) -> None:
+         """Validate that the branch exists in the repository"""
+         url = f"https://api.github.com/repos/{self.owner}/{self.repo}/branches/{self.branch}"
+         logger.info(f"Validating branch: {self.branch}")
+
+         response = requests.get(url)
+         if response.status_code == 404:
+             message = f"Branch '{self.branch}' not found in {self.owner}/{self.repo}"
+             logger.error(message)
+             raise ValueError(message)
+         response.raise_for_status()
+
+     def _get_cache_path(self) -> Path:
+         """Get the cache file path for this repo"""
+         return self.cache_dir / f"{self.owner}_{self.repo}_{self.branch}.zip"
+
+     def _download_zip(self) -> Path:
+         """Download the repo ZIP into the cache"""
+         cache_path = self._get_cache_path()
+
+         # Return the cached file if it exists
+         if cache_path.exists():
+             logger.info(f"Using cached file: {cache_path}")
+             return cache_path
+
+         # Download the ZIP archive for the branch
+         url = f"https://github.com/{self.owner}/{self.repo}/archive/refs/heads/{self.branch}.zip"
+         logger.info(f"Downloading {self.owner}/{self.repo} (branch: {self.branch})...")
+
+         response = requests.get(url)
+         response.raise_for_status()
+
+         with open(cache_path, 'wb') as f:
+             f.write(response.content)
+
+         logger.info(f"Saved to cache: {cache_path}")
+         return cache_path
+
+     def read_files(self, file_filter: Optional[Callable[[str], bool]] = None) -> Dict[str, str]:
+         """
+         Read files from the repo archive without extracting it.
+
+         Args:
+             file_filter: Optional predicate to filter files
+                 (e.g., lambda path: path.endswith('.py'))
+
+         Returns:
+             Dictionary mapping file paths to their contents
+         """
+         cache_path = self._download_zip()
+         files_content = {}
+
+         with zipfile.ZipFile(cache_path) as zip_file:
+             for filename in zip_file.namelist():
+                 # Skip directory entries
+                 if filename.endswith('/'):
+                     continue
+
+                 # Remove the root folder (format: repo-branch/path/to/file)
+                 clean_path = '/'.join(filename.split('/')[1:])
+                 if not clean_path:
+                     continue
+
+                 # Apply the filter
+                 if file_filter and not file_filter(clean_path):
+                     continue
+
+                 logger.info(f"Reading: {clean_path}")
+
+                 try:
+                     with zip_file.open(filename) as file:
+                         files_content[clean_path] = file.read().decode('utf-8', errors='ignore')
+                 except Exception as e:
+                     logger.exception(f"⚠️ Error reading {clean_path}: {e}")
+
+         return files_content
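
A quick usage sketch (network access assumed; the repo shown is just an example of a public repository):

```python
# Example: read only the Python files of a public repo.
from github_repo_downloader import GitHubRepoDownloader

downloader = GitHubRepoDownloader(
    repo_url="https://github.com/psf/requests",  # any public repo works
    branch="main",
)
files = downloader.read_files(file_filter=lambda path: path.endswith(".py"))
for path in sorted(files)[:5]:
    print(path, len(files[path]), "chars")
```
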
graph_converter.py ADDED
@@ -0,0 +1,41 @@
+ """
+ Graph Converter Module
+
+ Converts Pyan call graphs to NetworkX directed graphs for further processing.
+ """
+
+ import networkx as nx
+ from pyan_insperation.analyzer import CallGraphVisitor
+
+
+ def pyan_to_networkx(pyan_graph: CallGraphVisitor) -> nx.DiGraph:
+     """
+     Convert a Pyan call graph to a NetworkX directed graph.
+
+     This function processes both defines_edges (containment relationships such as
+     class-contains-method) and uses_edges (usage relationships such as
+     function-calls-function) from the Pyan analyzer and creates a unified
+     NetworkX graph with labeled edges.
+
+     Args:
+         pyan_graph: CallGraphVisitor instance with defines_edges and uses_edges
+
+     Returns:
+         nx.DiGraph with nodes and labeled edges ("contains" or "use")
+     """
+     graph = nx.DiGraph()
+
+     # Process defines_edges - containment relationships
+     for node, defined_nodes in pyan_graph.defines_edges.items():
+         graph.add_node(node)
+         for defined_node in defined_nodes:
+             graph.add_node(defined_node)
+             graph.add_edge(node, defined_node, label="contains")
+
+     # Process uses_edges - usage relationships
+     for node, used_nodes in pyan_graph.uses_edges.items():
+         graph.add_node(node)
+         for used_node in used_nodes:
+             graph.add_node(used_node)
+             graph.add_edge(node, used_node, label="use")
+
+     return graph
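
Since CallGraphVisitor itself lives outside this commit, here is a sketch of the conversion contract using a stand-in object with the same defines_edges/uses_edges shape:

```python
# Stand-in for CallGraphVisitor: dicts mapping a node to the nodes it defines/uses.
from types import SimpleNamespace
from graph_converter import pyan_to_networkx

fake_visitor = SimpleNamespace(
    defines_edges={"ClassA": ["ClassA.method"]},    # containment
    uses_edges={"ClassA.method": ["helper_func"]},  # usage
)
graph = pyan_to_networkx(pyan_graph=fake_visitor)
print(list(graph.edges(data=True)))
# [('ClassA', 'ClassA.method', {'label': 'contains'}),
#  ('ClassA.method', 'helper_func', {'label': 'use'})]
```
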
level_computer.py ADDED
@@ -0,0 +1,57 @@
+ """
+ Level Computer Module
+
+ Computes dependency levels for graph nodes to enable efficient batching for LLM processing.
+ """
+
+ import networkx as nx
+
+
+ def compute_node_levels(graph: nx.DiGraph) -> dict:
+     """
+     Compute the level of each node based on successor depth.
+
+     Nodes are assigned levels based on their position in the dependency graph:
+     - Level 0: nodes with no successors (leaf nodes)
+     - Level N: 1 + max(successor levels)
+
+     Cycles are handled by condensing the graph into strongly connected
+     components before computing levels.
+
+     Args:
+         graph: NetworkX directed graph
+
+     Returns:
+         Dictionary mapping each level (int) to the list of nodes at that level
+     """
+     # Condense the graph to handle strongly connected components (cycles)
+     C_graph = nx.condensation(graph)
+     scc_map = C_graph.graph['mapping']
+
+     levels = {}
+
+     def level(node):
+         """Recursively compute a node's level with memoization."""
+         if node in levels:
+             return levels[node]
+
+         succ = list(C_graph.successors(node))
+         if not succ:  # No outgoing edges -> level 0
+             levels[node] = 0
+         else:
+             levels[node] = 1 + max(level(s) for s in succ)
+         return levels[node]
+
+     # Compute levels for all nodes in the condensed graph
+     for node in C_graph.nodes():
+         level(node)
+
+     # Map each original node to its component's level, then group nodes by level.
+     node_to_level = {node: levels[scc_map[node]] for node in graph.nodes()}
+     level_to_nodes = {}
+     for node, node_level in node_to_level.items():
+         level_to_nodes.setdefault(node_level, []).append(node)
+     return level_to_nodes
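
A worked example on a toy graph with a cycle, showing how the parse/render strongly connected component collapses to level 0:

```python
# Worked example: a small dependency chain with a cycle.
import networkx as nx
from level_computer import compute_node_levels

g = nx.DiGraph()
g.add_edge("main", "parse")    # main uses parse
g.add_edge("main", "render")   # main uses render
g.add_edge("parse", "render")
g.add_edge("render", "parse")  # cycle: parse <-> render collapses to one SCC

print(compute_node_levels(g))
# {1: ['main'], 0: ['parse', 'render']}
```
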
modal_client.py ADDED
@@ -0,0 +1,40 @@
+ import os
+ from concurrent.futures import ThreadPoolExecutor
+
+ from openai import OpenAI
+
+ # Model names
+ EXPLANATION_MODEL = "Qwen/Qwen3-4B-Instruct-2507"
+ EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"
+
+
+ class ModalClient:
+
+     @staticmethod
+     def infer_llm(prompts: list[str], max_tokens: int = 800):
+         client = OpenAI(
+             base_url=os.environ.get("MODAL_URL_LLM_INFERENCE"),
+             api_key=os.environ.get('VLLM_API_KEY', 'not-needed'),
+         )
+
+         def process_one(prompt):
+             response = client.chat.completions.create(
+                 model=EXPLANATION_MODEL,
+                 messages=[{"role": "user", "content": prompt}],
+                 max_tokens=max_tokens,
+             )
+             return response.choices[0].message.content
+
+         # Send the prompts concurrently from a thread pool; the server batches them.
+         with ThreadPoolExecutor(max_workers=32) as executor:
+             results = list(executor.map(process_one, prompts))
+
+         return results
+
+     @staticmethod
+     def embedding_config():
+         return {
+             "base_url": os.environ.get("MODAL_URL_LLM_EMBEDDING"),
+             "api_key": os.environ.get('VLLM_API_KEY', 'not-needed'),
+             "model": EMBEDDING_MODEL,
+         }
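
A minimal sketch of calling the batch client, assuming MODAL_URL_LLM_INFERENCE and VLLM_API_KEY point at the deployed vLLM server; the snippets are made up:

```python
# Sketch: batch-explain two code snippets through the Modal-hosted LLM.
from modal_client import ModalClient

answers = ModalClient.infer_llm([
    "Explain: def add(a, b): return a + b",
    "Explain: squares = [x * x for x in range(10)]",
])
for answer in answers:
    print(answer)
```
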
modal_functions.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import subprocess
+
+ import modal
+
+ app = modal.App("code-understanding")
+
+ vllm_image = (
+     modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
+     .entrypoint([])
+     .uv_pip_install(
+         "vllm==0.11.2",
+         "huggingface-hub==0.36.0",
+         "flashinfer-python==0.5.2",
+     )
+     .env({"HF_XET_HIGH_PERFORMANCE": "1"})  # faster model transfers
+ )
+
+ # Configuration
+ EXPLANATION_MODEL = os.environ.get("EXPLANATION_MODEL", "Qwen/Qwen3-4B-Instruct-2507")
+ EMBEDDING_MODEL = os.environ.get("EMBEDDING_MODEL", "Qwen/Qwen3-Embedding-8B")
+ VLLM_PORT = 8000
+ MINUTES = 60
+ N_GPU = 1
+ FAST_BOOT = True
+
+
+ @app.function(
+     image=vllm_image,
+     gpu=f"A10:{N_GPU}",
+     scaledown_window=55 * MINUTES,  # how long to stay up with no requests
+     timeout=10 * MINUTES,  # how long to wait for container start
+     secrets=[modal.Secret.from_name("vllm-auth")],
+ )
+ @modal.concurrent(max_inputs=32)
+ @modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
+ def explain_code_batch():
+     cmd = [
+         "vllm",
+         "serve",
+         "--uvicorn-log-level=info",
+         EXPLANATION_MODEL,
+         "--served-model-name",
+         EXPLANATION_MODEL,
+         "--host", "0.0.0.0",
+         "--port", str(VLLM_PORT),
+         "--max-model-len", "40000",
+     ]
+
+     # Eager mode boots faster; CUDA graphs serve faster once warm.
+     cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
+     cmd += ["--tensor-parallel-size", str(N_GPU)]
+
+     print(cmd)
+     subprocess.Popen(" ".join(cmd), shell=True)
+
+
+ @app.function(
+     image=vllm_image,
+     gpu=f"A10:{N_GPU}",
+     scaledown_window=55 * MINUTES,
+     timeout=10 * MINUTES,
+     secrets=[modal.Secret.from_name("vllm-auth")],
+ )
+ @modal.concurrent(max_inputs=32)
+ @modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTES)
+ def generate_embeddings_batch():
+     cmd = [
+         "vllm",
+         "serve",
+         "--uvicorn-log-level=info",
+         EMBEDDING_MODEL,
+         "--served-model-name",
+         EMBEDDING_MODEL,
+         "--host", "0.0.0.0",
+         "--port", str(VLLM_PORT),
+         "--task", "embedding",
+         "--max-model-len", "40000",
+     ]
+
+     cmd += ["--enforce-eager" if FAST_BOOT else "--no-enforce-eager"]
+     cmd += ["--tensor-parallel-size", str(N_GPU)]
+
+     print(cmd)
+     subprocess.Popen(" ".join(cmd), shell=True)
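
Once deployed, each web server speaks the OpenAI-compatible vLLM API. A small probe sketch; it assumes MODAL_URL_LLM_EMBEDDING holds the server's OpenAI-compatible base URL, as modal_client.py uses it:

```python
# Probe the deployed embedding server's model list.
import os
from openai import OpenAI

client = OpenAI(
    base_url=os.environ["MODAL_URL_LLM_EMBEDDING"],
    api_key=os.environ.get("VLLM_API_KEY", "not-needed"),
)
print([m.id for m in client.models.list()])  # expect ["Qwen/Qwen3-Embedding-8B"]
```
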
prompt_generator.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Prompt Generator Module
+
+ Generates structured prompts for LLM-based code explanation, organized by node level.
+ """
+ import ast
+ import copy
+
+ import networkx as nx
+ from structlog import get_logger
+
+ from modal_client import ModalClient
+
+ logger = get_logger(__name__)
+
+
+ def generate_explaination_by_level(graph: nx.DiGraph, levels: dict) -> dict[int, dict]:
+     """
+     Generate LLM prompts organized by node level and attach the resulting
+     explanations to the graph nodes.
+
+     Each prompt includes:
+     - The file path
+     - Used modules (name + content from graph successors)
+     - The node's content (unparsed AST), with already-explained children
+       replaced by summary markers
+
+     Nodes without a namespace are skipped, as they typically represent
+     external or incomplete references.
+
+     Args:
+         graph: NetworkX directed graph with code nodes
+         levels: Dictionary mapping each level to its list of nodes
+
+     Returns:
+         Dictionary mapping level -> {node: prompt_string}
+     """
+     prompts_by_level = {}
+
+     # Process levels bottom-up so child explanations exist before their parents' prompts.
+     for level in range(max(levels.keys()) + 1):
+         if level not in levels:
+             continue
+
+         batch = {}
+
+         for node in levels[level]:
+             # Skip built-ins, lambdas, and nodes without source.
+             if node.namespace is None or node.get_short_name() == "lambda" or node.ast_node is None:
+                 continue
+             # Short snippets are indexed verbatim (see app.py) rather than explained.
+             if len(ast.unparse(node.ast_node)) < 1000:
+                 continue
+
+             prompt = """You are a Python code analysis expert.
+
+ **CRITICAL RULES:**
+ 1. ONLY use information directly visible in the "TARGET CODE" section
+ 2. For methods marked as "[SUMMARIZED]", reference them by their actual name shown
+ 3. If a method body is replaced with a summary, DO NOT invent details about its implementation
+ 4. State "implementation details not shown" for summarized methods
+
+ Your explanation must be brief and cover:
+ - Purpose: What this code does (1-2 sentences)
+ - Inputs: Parameters (only those visible)
+ - Outputs: Return values (only those visible)
+ - Exceptions: Only exceptions explicitly raised in the visible code (1 sentence)
+
+ """
+             node_copy = copy.deepcopy(node)
+
+             # Collect used modules from graph successors and summarize contained children.
+             used_modules = []
+             summarized_methods = []
+             for used_node in graph.successors(node):
+                 # Ignore Python built-ins and nodes without source.
+                 if used_node.namespace is None or used_node.get_short_name() == "lambda" or used_node.ast_node is None:
+                     continue
+
+                 label = graph.get_edge_data(node, used_node).get("label")
+
+                 if label == 'contains':
+                     # Replace already-explained children with a summary marker.
+                     if used_node.ast_node in node.ast_node.body and hasattr(used_node, "explination"):
+                         if isinstance(used_node.ast_node, ast.FunctionDef):
+                             # Keep the function signature visible
+                             signature = f"def {used_node.ast_node.name}({ast.unparse(used_node.ast_node.args)})"
+                             if used_node.ast_node.returns:
+                                 signature += f" -> {ast.unparse(used_node.ast_node.returns)}"
+
+                             marker_text = f"""[SUMMARIZED METHOD]
+ Method: {used_node.name}
+ Signature: {signature}
+ Summary: {used_node.explination}
+ Note: Full implementation replaced for brevity"""
+                         elif isinstance(used_node.ast_node, ast.ClassDef):
+                             marker_text = f"""[SUMMARIZED CLASS]
+ Class: {used_node.name}
+ Summary: {used_node.explination}
+ Note: Full implementation replaced for brevity"""
+                         else:
+                             marker_text = f"""[SUMMARIZED]
+ Name: {used_node.name}
+ Summary: {used_node.explination}"""
+
+                         new_child = ast.Expr(value=ast.Constant(value=marker_text))
+                         for i, child in enumerate(node.ast_node.body):
+                             if child == used_node.ast_node:
+                                 node_copy.ast_node.body[i] = new_child
+                                 summarized_methods.append(used_node.name)
+                                 break
+                 elif label == 'use':
+                     used_modules.append(used_node)
+
+             # Build the prompt
+             prompt += f"**Target File Path:** {node.filename}\n\n"
+             logger.info(f"number of used modules: {len(used_modules)}")
+             if used_modules:
+                 prompt += "**External Dependencies Used:**\n"
+                 for used_node in used_modules:
+                     if hasattr(used_node, "explination"):
+                         prompt += f"""- **{used_node.name}** [EXPLAINED]
+   - File: {used_node.filename}
+   - Explanation: {used_node.explination}
+ """
+                     else:
+                         prompt += f"""- **{used_node.name}**
+   - File: {used_node.filename}
+   - Python Code: {ast.unparse(used_node.ast_node)}
+ """
+
+             if summarized_methods:
+                 prompt += f"**Note:** The following methods are summarized in the code below: {', '.join(summarized_methods)}\n\n"
+
+             prompt += f"""**TARGET CODE:**
+ ```python
+ {ast.unparse(node_copy.ast_node)}
+ ```
+
+ Explain the TARGET CODE above. Be brief and precise.
+ """
+
+             batch[node] = prompt
+
+         if batch:
+             results = ModalClient.infer_llm(list(batch.values()))
+             for index, node in enumerate(batch.keys()):
+                 node.explination = results[index]
+             prompts_by_level[level] = batch
+
+     return prompts_by_level
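
The core trick above, swapping an already-explained child out of the parent's AST for a marker string, can be seen in isolation in this small sketch:

```python
# Isolated sketch: replace a child function in a module AST with a summary marker.
import ast

source = """
def helper():
    return 42

def main():
    return helper() + 1
"""
tree = ast.parse(source)
marker = ast.Expr(value=ast.Constant(value="[SUMMARIZED] helper: returns a constant"))
tree.body[0] = marker  # swap out the helper() definition
print(ast.unparse(tree))  # main() stays intact; helper is now a one-line marker
```
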
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio[mcp]
+ structlog
+ requests
+ networkx
+ matplotlib
+ modal
+ openai
+ python-dotenv
+ llama-index-vector-stores-chroma
+ llama-index