Spaces:

Krishna324
/

WhatRepo

Sleeping

App Files Files Community

Krishna172912 committed on Apr 22

Commit

8ddfaad

unverified ·

1 Parent(s): 170ed9f

Add files via upload

Browse files

Files changed (2) hide show

back_end/agent/graph.py +193 -0
back_end/agent/tools.py +334 -0

back_end/agent/graph.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import tiktoken
+from langchain_core.messages import trim_messages,HumanMessage, AIMessage
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langgraph.graph import MessagesState,StateGraph, START, END, MessagesState
+from langgraph.prebuilt import ToolNode, tools_condition
+from pydantic import BaseModel, Field
+from typing import Literal
+import json
+from pathlib import Path
+from langchain_core.documents import Document
+from agent.tools import get_code_search_tools
+from config import (
+    SUPERVISOR_SYSTEM_PROMPT,
+    AGENT_SYSTEM_PROMPT_HEADER,
+    AGENT_SYSTEM_PROMPT_TOOLS,
+    AGENT_SYSTEM_PROMPT_TOOLS_NO_DB,
+    AGENT_SYSTEM_PROMPT_FOOTER,
+)
+enc = tiktoken.get_encoding("cl100k_base")
+def _tiktoken_counter(messages):
+    total = 0
+    for m in messages:
+        text_to_encode = ""
+        # 1. Extract content and tool_calls safely
+        if isinstance(m, dict):
+            content = m.get("content", "")
+            tool_calls = m.get("tool_calls", [])
+        else:
+            content = getattr(m, "content", "")
+            tool_calls = getattr(m, "tool_calls", [])
+        # 2. Handle string or list content
+        if isinstance(content, list):
+            text_to_encode += str(content)
+        else:
+            text_to_encode += str(content)
+        # 3. CRITICAL: Catch tool calls so they don't bypass the counter
+        if tool_calls:
+            text_to_encode += json.dumps(tool_calls)
+        # Encode and count
+        total += len(enc.encode(text_to_encode))
+    return total
+# ---------------------------------------------------------
+# 1. AGENT NODE
+# ---------------------------------------------------------
+def initialize_agent(is_vector_db_created: bool, tools: list):
+    # llm = ChatGoogleGenerativeAI( model="gemini-3.1-flash-lite-preview",temperature=0 )
+    llm = ChatGoogleGenerativeAI( model="gemma-4-31b-it",temperature=0 )
+    llm_with_tools = llm.bind_tools(tools)
+    message_trimmer = trim_messages(
+        max_tokens=200000,
+        strategy="last",
+        token_counter=_tiktoken_counter, # We Use the Gemini model's specific token counter but it will make http request which will take too long so just just tiktoken wich will be good enough
+        include_system=True, # NEVER delete the system prompt/repo map
+        allow_partial=False # Don't chop a message in half
+    )
+    # Call the model to generate a response based on the current state.
+    # Given the question, it will decide to retrieve using the retriever tool, or simply respond to the user.
+    def generate_query_or_respond(state: MessagesState):
+        if is_vector_db_created:
+            system_prompt = f"{AGENT_SYSTEM_PROMPT_HEADER}\n\n{AGENT_SYSTEM_PROMPT_TOOLS}\n\n{AGENT_SYSTEM_PROMPT_FOOTER}"
+        else:
+            system_prompt = f"{AGENT_SYSTEM_PROMPT_HEADER}\n\n{AGENT_SYSTEM_PROMPT_TOOLS_NO_DB}\n\n{AGENT_SYSTEM_PROMPT_FOOTER}"
+        # 1. Inject the system prompt into the message history
+        messages_to_evaluate = [{"role": "system", "content": system_prompt}] + state["messages"]
+        # 2. to save context window,or not to runout of tokens we trim the context from past which in above max limit that we
+        trimmed_messages = message_trimmer.invoke(messages_to_evaluate)
+        # 3. Generate the response (PASS IN THE TRIMMED MESSAGES)
+        response = llm_with_tools.invoke(trimmed_messages)
+        return {"messages": [response]}
+    return generate_query_or_respond
+# ---------------------------------------------------------
+# 2. THE LEAD ARCHITECT (SUPERVISOR NODE)
+# ---------------------------------------------------------
+# 1. Define the decision schema
+class SupervisorDecision(BaseModel):
+    # Structured-output schema for the supervisor model's verdict.
+    # NOTE(review): documented with comments instead of a class docstring on
+    # purpose; a docstring would presumably end up in the schema description
+    # sent to the model. Confirm before adding one.
+    # The field descriptions below are instructions to the model.
+    # Step-by-step justification produced before the verdict.
+    reasoning: str = Field(
+        description="1. What did the user ask? 2. What raw data is in the tool outputs? 3. Is the raw data sufficient to answer the user?"
+    )
+    # Verdict: only "ACCEPT" or "REJECT" are valid values.
+    status: Literal["ACCEPT", "REJECT"] = Field(
+        description="ACCEPT if the RAW TOOL OUTPUTS contain enough info to answer the user. REJECT if the agent needs to search for more specific files."
+    )
+    # Final answer (ACCEPT) or follow-up instructions for the agent (REJECT).
+    content: str = Field(
+        description="If ACCEPT: Write the final, exhaustive response to the user. If REJECT: Write targeted instructions telling the agent what to search for next."
+    )
+def initialize_supervisor():
+    powerful_llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2,max_output_tokens=65536)
+    powerful_agent = powerful_llm.with_structured_output(SupervisorDecision)
+    def supervisor_node(state: MessagesState):
+        # Calculate iteration count based on previous feedback messages
+        iteration_count = sum(
+            1 for m in state["messages"]
+            if isinstance(m, HumanMessage) and "SUPERVISOR FEEDBACK:" in m.content
+        )
+        system_prompt =  SUPERVISOR_SYSTEM_PROMPT
+        # STRUCTURAL SAFEGUARD: Force accept after 2 rejections
+        if iteration_count >= 2:
+            system_prompt += """
+            \n\n*** CRITICAL OVERRIDE ***
+            You have rejected the researcher 2 times. You MUST now output status="ACCEPT" and synthesize the best possible final answer from ALL available evidence, explicitly noting what is implicit vs explicit. DO NOT REJECT.
+            """
+        messages_to_evaluate = [{"role": "system", "content": system_prompt}] + state["messages"]
+        decision = powerful_agent.invoke(messages_to_evaluate)
+        if decision.status == "ACCEPT":
+            return {"messages": [AIMessage(content=decision.content)]}
+        else:
+            return {"messages": [HumanMessage(content=f"SUPERVISOR FEEDBACK: {decision.content}")]}
+    return supervisor_node
+# --- Custom Router for the Supervisor ---
+def route_supervisor(state: MessagesState):
+    last_message = state["messages"][-1]
+    # If the supervisor returned an AIMessage, it ACCEPTED the work. We are done.
+    if isinstance(last_message, AIMessage):
+        return END
+    # If it returned a HumanMessage, it REJECTED the work. Send back to the researcher.
+    return "agent"
+def build_workflow(
+        repo_storage: Path,
+        is_vector_db_created: bool,
+        all_splits: list[Document] = None,
+        vector_db = None
+    ):
+    tools = get_code_search_tools(repo_storage,is_vector_db_created,all_splits,vector_db)
+    agent_node = initialize_agent(is_vector_db_created,tools)
+    supervisor_node = initialize_supervisor()
+    # --- Building the Graph ---
+    workflow = StateGraph(MessagesState)
+    # --- Add our nodes to the graph ---
+    # Set the entry point: Start by calling the agent
+    workflow.add_edge(START, "agent")
+    workflow.add_node("agent", agent_node)
+    workflow.add_node("tools", ToolNode(tools))
+    workflow.add_node("supervisor",supervisor_node)
+    # --- Routing ---
+    # After the 'agent' node runs, check the output.
+    # tools_condition automatically checks: Did the agent output a tool_call?
+    # - If YES: route to the "tools" node.
+    # - If NO: route to END.
+    workflow.add_conditional_edges(
+        "agent",
+        tools_condition,
+        {
+            "tools": "tools",       # If tool call, go to tools
+            END: "supervisor"       # (CHANGED) If done with tools, go to supervisor instead of END
+        }
+    )
+    # After the tools finish executing, ALWAYS route back to the agent.
+    # The agent needs to read the tool output and decide what to do next.
+    workflow.add_edge("tools", "agent")
+    workflow.add_conditional_edges("supervisor", route_supervisor, { "agent":"agent",END : END })
+    # --- Compile ---
+    return workflow.compile()

back_end/agent/tools.py ADDED Viewed

	@@ -0,0 +1,334 @@

+import os
+import fnmatch
+import itertools
+from pathlib import Path
+from langchain.tools import tool
+from langchain_community.retrievers import BM25Retriever
+from langchain_core.documents import Document
+from langchain_core.tools import BaseTool
+from config import EXCLUDE_PATTERNS
+def get_code_search_tools(
+    repo_storage: Path,
+    is_vector_db_created: bool,
+    all_splits: list[Document] = None,
+    vector_db = None
+)-> list[BaseTool]:
+    # Initialize BM25 only if we have vector data
+    bm25_retriever = None
+    if is_vector_db_created and all_splits:
+        bm25_retriever = BM25Retriever.from_documents(all_splits, k=10)
+    @tool
+    def exact_code_search(search_pattern: str) -> str:
+        """
+        Search the codebase for an exact literal string.
+        Use this tool FIRST when looking for exact function definitions, variable usages,
+        specific syntax, or known class names.
+        Input should be the exact string you want to find. (Note: Regex is NOT supported).
+        """
+        try:
+            base_path = repo_storage.resolve()
+            MAX_LINES = 350
+            matches = []
+            # 1. Updated validation function using your global EXCLUDE_PATTERNS
+            def is_valid_file(p: Path) -> bool:
+                # Skip non-files and symlinks
+                if p.is_symlink() or not p.is_file():
+                    return False
+                # Convert path to string with forward slashes for consistent glob matching
+                path_str = p.as_posix()
+                # Check against global patterns
+                for pattern in EXCLUDE_PATTERNS:
+                    if fnmatch.fnmatch(path_str, pattern):
+                        return False
+                return True
+            # 2. The combined search logic
+            for file_path in base_path.rglob("*"):
+                if not is_valid_file(file_path):
+                    continue
+                try:
+                    rel_path = file_path.relative_to(repo_storage).as_posix()
+                    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                        for i, line in enumerate(f, 1):
+                            if search_pattern in line:
+                                matches.append(f"{rel_path}:{i}:{line.strip()}")
+                                if len(matches) >= MAX_LINES:
+                                    break
+                except Exception:
+                    continue
+                if len(matches) >= MAX_LINES:
+                    break
+            # 3. Format output
+            if not matches:
+                return f"No exact matches found for '{search_pattern}'."
+            output = "\n".join(matches)
+            if len(matches) >= MAX_LINES:
+                return f"--- EXACT MATCHES (first {MAX_LINES}) ---\n{output}\n\n... (Output truncated to save context)"
+            else:
+                return f"--- EXACT MATCHES ---\n{output}"
+        except Exception as e:
+            return f"Search error: {str(e)}"
+    # -----------------------------------------------------------------------------
+    # Tool 2: Retrieval using BM25
+    # -----------------------------------------------------------------------------
+    @tool
+    def keyword_code_search(query: str, k: int = 5) -> str:
+        """
+        Search the codebase using exact keyword matching (BM25).
+        Use this tool when looking for files containing specific keywords, error messages,
+        or terminology where exact syntax matching isn't strictly required but specific words are important.
+        Input should be a set of relevant keywords and the number of chunks (k) to return.
+        """
+        try:
+            # Update k dynamically so the agent can control how much context it gets
+            bm25_retriever.k = k
+            docs = bm25_retriever.invoke(query)
+            if not docs:
+                return f"No keyword matches found for '{query}'."
+            formatted_chunks = []
+            for doc in docs:
+                source_file = doc.metadata.get("source", "Unknown File")
+                formatted_chunks.append(f"--- File_Source: {source_file} ---\n{doc.page_content}")
+            return "\n\n".join(formatted_chunks)
+        except Exception as e:
+            return f"Keyword search error: {str(e)}"
+    # -----------------------------------------------------------------------------
+    # Tool 3: Simple retrieval from the vector DB based on cosine similarity
+    # -----------------------------------------------------------------------------
+    @tool
+    def semantic_code_search(query: str, k: int = 5) -> str:
+        """
+        Search the codebase using semantic vector embeddings.
+        Use this tool to understand concepts, architecture, or ask natural language questions
+        like "how does the database connection work?" or "where is the staging logic?"
+        Do NOT use this for exact variable lookups or specific function signatures.
+        Input should be a natural language query and the number of chunks (k) to return.
+        """
+        try:
+            # Create a dynamic retriever on the fly to inject the agent's requested 'k'
+            # Adjust search_type to "similarity" or "similarity_score_threshold" based on your DB setup
+            temp_dense_retriever = vector_db.as_retriever(
+                search_type="similarity",
+                search_kwargs={"k": k}
+            )
+            docs = temp_dense_retriever.invoke(query)
+            if not docs:
+                return f"No semantic matches found for '{query}'."
+            formatted_chunks = []
+            for doc in docs:
+                source_file = doc.metadata.get("source", "Unknown File")
+                formatted_chunks.append(f"--- File_Source: {source_file} ---\n{doc.page_content}")
+            return "\n\n".join(formatted_chunks)
+        except Exception as e:
+            return f"Semantic search error: {str(e)}"
+    # -----------------------------------------------------------------------------
+    # Tool 4: get contents of a specified file
+    # -----------------------------------------------------------------------------
+    @tool
+    def get_specific_file(file_path: str, start_line: int = None, end_line: int = None) -> str:
+        """
+        Get the text contents of a specific file from the repository.
+        - If start_line and end_line are NOT provided, it returns the entire file (up to 50,000 bytes).
+        - If start_line and end_line ARE provided (1-indexed), it returns only those specific lines, bypassing the file size limit.
+        Use this tool to read entire small files, or to paginate through massive files by requesting specific line ranges.
+        Input should be the exact file path, and optionally the start and end line numbers.
+        """
+        try:
+            clean_path = file_path.lstrip('/')
+            target_path = (repo_storage / clean_path).resolve()
+            # 1. Security Check: Prevent path traversal
+            if not target_path.is_relative_to(repo_storage):
+                return "Error: Access denied. You cannot read files outside the repository root."
+            absolute_file_path = str(target_path)
+            # ---------------------------------------------------------
+            # MODE 1: Specific Line Range Requested
+            # ---------------------------------------------------------
+            if start_line is not None or end_line is not None:
+                # Handle cases where the LLM provides one but not the other
+                start_line = start_line if start_line is not None else 1
+                end_line = end_line if end_line is not None else (start_line + 300)
+                # Sanity checks for the agent
+                if start_line < 1:
+                    return "Error: start_line must be >= 1."
+                if end_line < start_line:
+                    return "Error: end_line must be >= start_line."
+                # Protect context window: limit the maximum lines requested at once
+                MAX_LINES_TO_READ = 500
+                if (end_line - start_line + 1) > MAX_LINES_TO_READ:
+                    return f"Error: You can only request up to {MAX_LINES_TO_READ} lines at a time to save context space."
+                try:
+                    # Use itertools.islice to lazily read only the needed lines without loading the whole file into RAM
+                    with open(absolute_file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                        # islice is 0-indexed, so we subtract 1 from start_line. end_line is exclusive.
+                        lines = list(itertools.islice(f, start_line - 1, end_line))
+                    if not lines:
+                        return f"Error: The requested lines ({start_line}-{end_line}) are out of bounds for this file."
+                    content = "".join(lines)
+                    return f"--- File_Source: {file_path} (Lines {start_line}-{end_line}) ---\n{content}"
+                except Exception as e:
+                    return f"Error reading specific lines from {file_path}: {str(e)}"
+            # ---------------------------------------------------------
+            # MODE 2: Entire File Requested
+            # ---------------------------------------------------------
+            else:
+                # Check file size using the ABSOLUTE path
+                file_size = os.path.getsize(absolute_file_path)
+                # Rough estimation: 1 byte is roughly 1 character in standard encoding
+                if file_size > 50000:
+                    return (f"Error: The file '{file_path}' is too large ({file_size} bytes) to load entirely. "
+                            f"Please use this tool again and provide `start_line` and `end_line` parameters to read specific sections or consider other tools such as exact_code_serch.")
+            with open(absolute_file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                content = f.read()
+                return f"--- File_Source: {file_path} ---\n{content}"
+        except FileNotFoundError:
+            return f"Error: The file '{file_path}' was not found. Please verify the path."
+        except Exception as e:
+            return f"Error loading {file_path}: {str(e)}"
+    # -----------------------------------------------------------------------------
+    # Tool 5: directory look up [like ls in terminal]
+    # -----------------------------------------------------------------------------
+    @tool
+    def list_directory_contents(directory_path: str) -> str:
+        """
+        List the contents of a specific directory within the repository.
+        Use this tool to explore the folder structure, see what files exist,
+        and understand how the codebase is organized.
+        Input should be a relative path from the repository root (e.g., 'repo_name/components','repo_name','repo_name/data/readmes/).
+        """
+        try:
+            # 1. Security & Path Resolution (Crucial!)
+            base_path = Path(repo_storage).resolve()
+            # Handle cases where the LLM passes absolute paths or starts with '/'
+            clean_path = directory_path.lstrip('/')
+            target_path = (base_path / clean_path).resolve()
+            # Prevent Path Traversal Attacks (e.g., agent trying to read '../../etc/passwd')
+            if not target_path.is_relative_to(base_path):
+                return "Error: Access denied. You cannot read directories outside the repository root."
+            # 2. State Checking
+            if not target_path.exists():
+                return f"Error: The directory '{directory_path}' does not exist in this repository."
+            if not target_path.is_dir():
+                return (f"Error: '{directory_path}' is a file, not a directory. "
+                        f"If you want to read it, use the get_specific_file tool.")
+            # 3. Gather Context-Rich Contents
+            items = []
+            for entry in os.scandir(target_path):
+                # Skip annoying OS files
+                if entry.name in ['.DS_Store', 'Thumbs.db']:
+                    continue
+                if entry.is_dir():
+                    items.append(f"📁 [DIR]  {entry.name}/")
+                else:
+                    # Add file sizes so the agent knows if a file is safe to read whole
+                    size_kb = entry.stat().st_size / 1024
+                    items.append(f"📄 [FILE] {entry.name} ({size_kb:.1f} KB)")
+            # Sort directories first, then files alphabetically
+            items.sort(key=lambda x: (not x.startswith("📁"), x.lower()))
+            if not items:
+                return f"The directory '{directory_path}' is completely empty."
+            # 4. Context Window Protection
+            MAX_ITEMS = 200
+            if len(items) > MAX_ITEMS:
+                truncated_count = len(items) - MAX_ITEMS
+                items = items[:MAX_ITEMS]
+                items.append(f"\n... (Output truncated: {truncated_count} more items not shown to save space) ...")
+            return f"--- Contents of /{clean_path} ---\n" + "\n".join(items)
+        except Exception as e:
+            return f"An error occurred while reading the directory: {str(e)}"
+    # -----------------------------------------------------------------------------
+    # Tool 6: find_file_path_by_pattern
+    # -----------------------------------------------------------------------------
+    @tool
+    def find_file_path_by_pattern(filename_pattern: str) -> str:
+        """
+        Search the repository for files matching a specific name or pattern.
+        Use this tool when you know the name of the file or script you are looking for
+        (e.g., 'build_npm_package.py' or '*.md').
+        Input should be a filename or glob pattern.
+        """
+        try:
+            base_path = repo_storage.resolve()
+            matches = []
+            # Walk through all files
+            for file_path in base_path.rglob("*"):
+                if file_path.is_file():
+                    # Check if the filename matches the pattern
+                    if fnmatch.fnmatch(file_path.name.lower(), filename_pattern.lower()):
+                        rel_path = file_path.relative_to(base_path)
+                        matches.append(rel_path.as_posix())
+                        if len(matches) >= 200:
+                            output = '\n'.join(matches)
+                            return f"--- FOUND FILES(truncated to 200) ---\n{output}"
+            if not matches:
+                return f"No files found matching the name '{filename_pattern}'."
+            output = '\n'.join(matches)
+            return f"--- FOUND FILES ---\n{output}"
+        except Exception as e:
+            return f"File search error: {str(e)}"
+    tools = [ exact_code_search, get_specific_file, list_directory_contents, find_file_path_by_pattern]
+    if is_vector_db_created :
+        tools.extend([semantic_code_search,keyword_code_search])
+    return tools