Spaces:

HimanshuGoyal2004
/

github-mcp-server

Running

App Files Files Community

HimanshuGoyal2004 committed on Oct 9, 2025

Commit

fbc9b21

1 Parent(s): f8c7c4a

updated retriever

Browse files

Files changed (2) hide show

app.py +142 -6
requirements.txt +6 -1

app.py CHANGED Viewed

@@ -5,12 +5,16 @@ from typing import Dict, List, Any
 import requests
 import gradio as gr
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 class GitHubMCPServer:
-    """GitHub MCP Server for repository scanning and file access"""
     def __init__(self):
         self.github_token = os.getenv("GITHUB_TOKEN")
@@ -21,6 +25,81 @@ class GitHubMCPServer:
             "Authorization": f"token {self.github_token}",
             "Accept": "application/vnd.github.v3+json"
         }
     def get_repository_info(self, owner: str, repo: str) -> dict:
         """Get basic repository information"""
@@ -117,6 +196,52 @@ class GitHubMCPServer:
                         self._scan_directory_sync(owner, repo, item["path"], extensions, all_files)
         except Exception:
             pass
 # Initialize the GitHub MCP server
 github_server = GitHubMCPServer()
@@ -158,19 +283,30 @@ demo = gr.TabbedInterface(
             title="Scan Repository for Code Files",
             description="Scan a GitHub repository for code files with specified extensions",
             api_name="scan_repository"
         )
     ],
     [
         "Repository Info",
         "File Content",
-        "Repository Scanner"
     ],
-    title="🐙 GitHub MCP Server"
 )
 if __name__ == "__main__":
-    print("🚀 Starting GitHub MCP Server with Gradio...")
-    print("📡 Server will provide GitHub repository access via MCP")
-    print("🛠️ Available tools: repository info, file content, repository scanner")
     demo.launch(mcp_server=True)

 import requests
 import gradio as gr
 from dotenv import load_dotenv
+from datasets import load_dataset
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.retrievers import BM25Retriever
 # Load environment variables
 load_dotenv()
 class GitHubMCPServer:
+    """GitHub MCP Server for repository scanning, file access, and CVE retrieval"""
     def __init__(self):
         self.github_token = os.getenv("GITHUB_TOKEN")
             "Authorization": f"token {self.github_token}",
             "Accept": "application/vnd.github.v3+json"
         }
+        # Initialize CVE retriever
+        self.cve_retriever = None
+        self._initialize_cve_retriever()
+    def _initialize_cve_retriever(self):
+        """Initialize the CVE retriever with Hugging Face dataset"""
+        try:
+            print("🔄 Loading CVE dataset from Hugging Face...")
+            # Load CVE dataset from Hugging Face
+            # Login using `huggingface-cli login` to access this dataset
+            dataset = load_dataset("Baction/cve", split="train")
+            print(f"📊 Loaded {len(dataset)} CVE records from Hugging Face")
+            # Create documents from CVE data
+            documents = []
+            for idx, record in enumerate(dataset):
+                # Extract relevant fields from the dataset
+                cve_id = record.get('cve_id', f'CVE-{idx}')
+                cwe_code = record.get('cwe_code', 'Unknown')
+                cwe_name = record.get('cwe_name', 'Unknown')
+                cvss_score = record.get('cvss_score', record.get('cvss', 'N/A'))
+                summary = record.get('summary', record.get('description', 'No summary available'))
+                # Skip records without essential information
+                if not summary or summary == 'No summary available':
+                    continue
+                # Create document content
+                content = f"""
+CVE ID: {cve_id}
+CWE Code: {cwe_code}
+CWE Name: {cwe_name}
+CVSS Score: {cvss_score}
+Summary: {summary}
+"""
+                # Create metadata
+                metadata = {
+                    'cve_id': str(cve_id),
+                    'cwe_code': str(cwe_code),
+                    'cwe_name': str(cwe_name),
+                    'cvss': cvss_score,
+                }
+                documents.append(Document(page_content=content.strip(), metadata=metadata))
+            print(f"📝 Created {len(documents)} CVE documents")
+            # Split documents for better retrieval
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=500,  # Increased chunk size for better context
+                chunk_overlap=50,
+                add_start_index=True,
+                strip_whitespace=True,
+                separators=["\n\n", "\n", ".", " "]
+            )
+            processed_docs = text_splitter.split_documents(documents)
+            # Initialize BM25 retriever
+            self.cve_retriever = BM25Retriever.from_documents(
+                processed_docs,
+                k=10  # Return top 10 most relevant documents
+            )
+            print(f"✅ CVE Retriever initialized with {len(processed_docs)} document chunks")
+        except Exception as e:
+            print(f"❌ Error initializing CVE retriever: {str(e)}")
+            print("💡 Make sure you have access to the Hugging Face dataset 'Baction/cve'")
+            print("💡 You may need to login with: huggingface-cli login")
+            self.cve_retriever = None
     def get_repository_info(self, owner: str, repo: str) -> dict:
         """Get basic repository information"""
                         self._scan_directory_sync(owner, repo, item["path"], extensions, all_files)
         except Exception:
             pass
def search_cve_database(self, query: str) -> str:
    """Search the CVE knowledge base and format the hits as Markdown text.

    Args:
        query: Free-text vulnerability description, e.g. "SQL injection".

    Returns:
        A Markdown report with one entry per retrieved chunk followed by an
        analysis summary. When the retriever is unavailable, the query is
        blank, nothing matches, or retrieval fails, a short human-readable
        message is returned instead; this method does not raise.
    """
    if self.cve_retriever is None:
        return "❌ CVE retriever not properly initialized. Please check Hugging Face dataset access."

    # A blank query carries no terms to rank on, so reject it up front.
    if not query or not query.strip():
        return "❌ Please provide a non-empty vulnerability query."

    summary_prefix = 'Summary:'
    header_prefixes = ('CVE ID:', 'CWE Code:', 'CWE Name:', 'CVSS Score:')

    try:
        docs = self.cve_retriever.invoke(query)

        if not docs:
            return f"No relevant CVE information found for query: '{query}'"

        # Collect fragments and join once instead of repeated string +=.
        parts = [f"🔍 **CVE Knowledge Base Results for: '{query}'**\n\n"]

        for i, doc in enumerate(docs, 1):
            metadata = doc.metadata
            parts.append(f"**Result {i}:**\n")
            parts.append(f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n")
            parts.append(f"- **CWE Code**: {metadata.get('cwe_code', 'Unknown')}\n")
            parts.append(f"- **CWE Name**: {metadata.get('cwe_name', 'Unknown')}\n")
            parts.append(f"- **CVSS Score**: {metadata.get('cvss', 'N/A')}\n")

            content_lines = doc.page_content.split('\n')
            summary = ''
            for line in content_lines:
                if line.startswith(summary_prefix):
                    # Drop only the leading label, not later occurrences.
                    summary = line[len(summary_prefix):].strip()
                    break
            if not summary:
                # Retrieved documents are chunks, so a chunk may hold only
                # the tail of a long description without its 'Summary:'
                # label. Show that text rather than claiming none exists.
                remainder = [
                    line.strip()
                    for line in content_lines
                    if line.strip() and not line.startswith(header_prefixes)
                ]
                summary = ' '.join(remainder) or 'No summary available'

            ellipsis = '...' if len(summary) > 200 else ''
            parts.append(f"- **Description**: {summary[:200]}{ellipsis}\n")
            parts.append("---\n")

        # dict.fromkeys de-duplicates while keeping retrieval order, so the
        # summary is deterministic and a CVE split across several chunks
        # is listed once.
        cve_ids = list(dict.fromkeys(
            str(doc.metadata.get('cve_id'))
            for doc in docs
            if doc.metadata.get('cve_id')
        ))
        unique_cwes = list(dict.fromkeys(
            str(doc.metadata.get('cwe_code'))
            for doc in docs
            if doc.metadata.get('cwe_code') and doc.metadata.get('cwe_code') != 'Unknown'
        ))

        more_ids = '...' if len(cve_ids) > 3 else ''
        parts.append("\n**📊 Analysis Summary:**\n")
        parts.append(f"- **CVE Examples**: {', '.join(cve_ids[:3])}{more_ids}\n")
        parts.append(f"- **Common CWE Codes**: {', '.join(unique_cwes[:5])}\n")
        parts.append(f"- **Total Matches**: {len(docs)}\n")

        return "".join(parts)

    except Exception as e:
        # Tool boundary: surface the failure to the caller as text.
        return f"❌ Error retrieving CVE information: {str(e)}"
 # Initialize the GitHub MCP server
 github_server = GitHubMCPServer()
             title="Scan Repository for Code Files",
             description="Scan a GitHub repository for code files with specified extensions",
             api_name="scan_repository"
+        ),
+        gr.Interface(
+            fn=github_server.search_cve_database,
+            inputs=[
+                gr.Textbox(label="Vulnerability Query", placeholder="SQL injection, XSS, command injection, etc.")
+            ],
+            outputs=gr.Textbox(label="CVE Search Results", lines=25),
+            title="Search CVE Database",
+            description="Search the CVE knowledge base for vulnerability patterns and CWE information",
+            api_name="search_cve_database"
         )
     ],
     [
         "Repository Info",
         "File Content",
+        "Repository Scanner",
+        "CVE Database"
     ],
+    title="🐙 GitHub MCP Server with CVE Knowledge Base"
 )
 if __name__ == "__main__":
+    print("🚀 Starting GitHub MCP Server with CVE Knowledge Base...")
+    print("📡 Server will provide GitHub repository access and CVE search via MCP")
+    print("🛠️ Available tools: repository info, file content, repository scanner, CVE database search")
     demo.launch(mcp_server=True)

requirements.txt CHANGED Viewed

@@ -5,4 +5,9 @@ mcp==1.10.1
 smolagents>=0.1.0
 requests>=2.28.0
 python-dotenv>=1.0.0
-pydantic>=2.11,<2.12

 smolagents>=0.1.0
 requests>=2.28.0
 python-dotenv>=1.0.0
+pydantic>=2.11,<2.12
+datasets>=2.0.0
+langchain>=0.1.0
+langchain-community>=0.0.20
+sentence-transformers>=2.2.0
+rank-bm25>=0.2.2