Spaces:

HimanshuGoyal2004
/

github-mcp-server

Running

App Files Community

HimanshuGoyal2004 committed on Oct 14, 2025

Commit

bb6f140

1 Parent(s): 1168745

fix

Browse files

Files changed (1) hide show

app.py +43 -46

app.py CHANGED Viewed

@@ -37,63 +37,61 @@ class GitHubMCPServer:
             # Load CVE dataset from Hugging Face
             # Login using `huggingface-cli login` to access this dataset
-            dataset = load_dataset("Baction/cve", split="train")
             print(f"📊 Loaded {len(dataset)} CVE records from Hugging Face")
             # Debug: Print first few records to understand dataset structure
             print("🔍 Dataset structure analysis:")
-            for i in range(min(3, len(dataset))):
                 print(f"Record {i}: {dict(dataset[i])}")
-            # Create documents from CVE data
             documents = []
             for idx, record in enumerate(dataset):
-                # Extract relevant fields from the dataset - check multiple possible field names
-                cve_id = (record.get('cve_id') or
-                         record.get('CVE_ID') or
-                         record.get('id') or
-                         record.get('cve') or
-                         f'CVE-UNKNOWN-{idx}')
-                cwe_code = record.get('cwe_code', record.get('CWE', 'Unknown'))
-                cwe_name = record.get('cwe_name', record.get('cwe_description', 'Unknown'))
-                cvss_score = record.get('cvss_score', record.get('cvss', record.get('CVSS', 'N/A')))
-                summary = (record.get('summary') or
-                          record.get('description') or
-                          record.get('Description') or
-                          'No summary available')
-                # Skip records without essential information or invalid CVE IDs
-                if not summary or summary == 'No summary available':
                     continue
                 # Validate CVE ID format (should be CVE-YYYY-NNNNN)
                 import re
                 if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
-                    print(f"⚠️ Skipping invalid CVE ID: {cve_id}")
                     continue
-                # Create document content
                 content = f"""
 CVE ID: {cve_id}
-CWE Code: {cwe_code}
-CWE Name: {cwe_name}
-CVSS Score: {cvss_score}
-Summary: {summary}
 """
                 # Create metadata
                 metadata = {
                     'cve_id': str(cve_id),
-                    'cwe_code': str(cwe_code),
-                    'cwe_name': str(cwe_name),
-                    'cvss': cvss_score,
                 }
                 documents.append(Document(page_content=content.strip(), metadata=metadata))
-            print(f"📝 Created {len(documents)} CVE documents")
             # Split documents for better retrieval
             text_splitter = RecursiveCharacterTextSplitter(
@@ -116,8 +114,9 @@ Summary: {summary}
         except Exception as e:
             print(f"❌ Error initializing CVE retriever: {str(e)}")
-            print("💡 Make sure you have access to the Hugging Face dataset 'Baction/cve'")
             print("💡 You may need to login with: huggingface-cli login")
             self.cve_retriever = None
     def get_repository_info(self, owner: str, repo: str) -> dict:
@@ -235,26 +234,22 @@ Summary: {summary}
                 metadata = doc.metadata
                 result += f"**Result {i}:**\n"
                 result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
-                result += f"- **CWE Code**: {metadata.get('cwe_code', 'Unknown')}\n"
-                result += f"- **CWE Name**: {metadata.get('cwe_name', 'Unknown')}\n"
-                result += f"- **CVSS Score**: {metadata.get('cvss', 'N/A')}\n"
-                # Extract summary from content
-                content_lines = doc.page_content.split('\n')
-                summary_line = next((line for line in content_lines if line.startswith('Summary:')), '')
-                summary = summary_line.replace('Summary: ', '').strip() if summary_line else 'No summary available'
-                result += f"- **Description**: {summary[:200]}{'...' if len(summary) > 200 else ''}\n"
                 result += "---\n"
             # Add summary of common patterns
             cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
-            cwe_codes = [doc.metadata.get('cwe_code') for doc in docs if doc.metadata.get('cwe_code') and doc.metadata.get('cwe_code') != 'Unknown']
-            unique_cwes = list(set(cwe_codes))
             result += f"\n**📊 Analysis Summary:**\n"
             result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
-            result += f"- **Common CWE Codes**: {', '.join(unique_cwes[:5])}\n"
             result += f"- **Total Matches**: {len(docs)}\n"
             return result
@@ -288,13 +283,15 @@ Summary: {summary}
                 metadata = doc.metadata
                 cve_id = metadata.get('cve_id', 'Unknown')
-                # Extract summary from content
-                content_lines = doc.page_content.split('\n')
-                summary_line = next((line for line in content_lines if line.startswith('Summary:')), '')
-                summary = summary_line.replace('Summary: ', '').strip() if summary_line else 'No summary available'
                 result += f"{i}. {cve_id}\n"
-                result += f"   {summary}\n\n"
             return result.strip()

             # Load CVE dataset from Hugging Face
             # Login using `huggingface-cli login` to access this dataset
+            dataset = load_dataset("CIRCL/vulnerability", split="train")
             print(f"📊 Loaded {len(dataset)} CVE records from Hugging Face")
             # Debug: Print first few records to understand dataset structure
             print("🔍 Dataset structure analysis:")
+            print(f"Dataset columns: {dataset.column_names}")
+            for i in range(min(2, len(dataset))):
                 print(f"Record {i}: {dict(dataset[i])}")
+            # Create documents from CVE data using CIRCL/vulnerability dataset structure
             documents = []
+            valid_count = 0
+            skipped_count = 0
             for idx, record in enumerate(dataset):
+                # Extract fields from CIRCL/vulnerability dataset
+                # Columns: id, title, description, cpes
+                cve_id = record.get('id', '')
+                # Skip title as most values are null
+                description = record.get('description', '')
+                # Skip cpes for now but may use later
+                # Skip records without essential information
+                if not cve_id or not description:
+                    skipped_count += 1
                     continue
                 # Validate CVE ID format (should be CVE-YYYY-NNNNN)
                 import re
                 if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
+                    skipped_count += 1
                     continue
+                # Create document content using only available fields
                 content = f"""
 CVE ID: {cve_id}
+Description: {description}
 """
                 # Create metadata
                 metadata = {
                     'cve_id': str(cve_id),
+                    'description': str(description)
                 }
                 documents.append(Document(page_content=content.strip(), metadata=metadata))
+                valid_count += 1
+            print(f"📝 Created {valid_count} valid CVE documents (skipped {skipped_count} invalid records)")
+            if not documents:
+                print("❌ No valid CVE documents found in dataset")
+                self.cve_retriever = None
+                return
             # Split documents for better retrieval
             text_splitter = RecursiveCharacterTextSplitter(
         except Exception as e:
             print(f"❌ Error initializing CVE retriever: {str(e)}")
+            print("💡 Make sure you have access to the Hugging Face dataset 'CIRCL/vulnerability'")
             print("💡 You may need to login with: huggingface-cli login")
+            print("💡 Dataset columns should be: id, title, description, cpes")
             self.cve_retriever = None
     def get_repository_info(self, owner: str, repo: str) -> dict:
                 metadata = doc.metadata
                 result += f"**Result {i}:**\n"
                 result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
+                # Extract description from content or metadata
+                description = metadata.get('description', '')
+                if not description:
+                    content_lines = doc.page_content.split('\n')
+                    desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
+                    description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
+                result += f"- **Description**: {description[:200]}{'...' if len(description) > 200 else ''}\n"
                 result += "---\n"
             # Add summary of common patterns
             cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
             result += f"\n**📊 Analysis Summary:**\n"
             result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
             result += f"- **Total Matches**: {len(docs)}\n"
             return result
                 metadata = doc.metadata
                 cve_id = metadata.get('cve_id', 'Unknown')
+                # Extract description from metadata or content
+                description = metadata.get('description', '')
+                if not description:
+                    content_lines = doc.page_content.split('\n')
+                    desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
+                    description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
                 result += f"{i}. {cve_id}\n"
+                result += f"   {description[:150]}{'...' if len(description) > 150 else ''}\n\n"
             return result.strip()