Commit
·
bb6f140
1
Parent(s):
1168745
fix
Browse files
app.py
CHANGED
|
@@ -37,63 +37,61 @@ class GitHubMCPServer:
|
|
| 37 |
|
| 38 |
# Load CVE dataset from Hugging Face
|
| 39 |
# Login using `huggingface-cli login` to access this dataset
|
| 40 |
-
dataset = load_dataset("
|
| 41 |
|
| 42 |
print(f"π Loaded {len(dataset)} CVE records from Hugging Face")
|
| 43 |
|
| 44 |
# Debug: Print first few records to understand dataset structure
|
| 45 |
print("π Dataset structure analysis:")
|
| 46 |
-
|
|
|
|
| 47 |
print(f"Record {i}: {dict(dataset[i])}")
|
| 48 |
|
| 49 |
-
# Create documents from CVE data
|
| 50 |
documents = []
|
|
|
|
|
|
|
|
|
|
| 51 |
for idx, record in enumerate(dataset):
|
| 52 |
-
# Extract
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
cwe_code = record.get('cwe_code', record.get('CWE', 'Unknown'))
|
| 60 |
-
cwe_name = record.get('cwe_name', record.get('cwe_description', 'Unknown'))
|
| 61 |
-
cvss_score = record.get('cvss_score', record.get('cvss', record.get('CVSS', 'N/A')))
|
| 62 |
-
summary = (record.get('summary') or
|
| 63 |
-
record.get('description') or
|
| 64 |
-
record.get('Description') or
|
| 65 |
-
'No summary available')
|
| 66 |
|
| 67 |
-
# Skip records without essential information
|
| 68 |
-
if not
|
|
|
|
| 69 |
continue
|
| 70 |
|
| 71 |
# Validate CVE ID format (should be CVE-YYYY-NNNNN)
|
| 72 |
import re
|
| 73 |
if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
|
| 74 |
-
|
| 75 |
continue
|
| 76 |
|
| 77 |
-
# Create document content
|
| 78 |
content = f"""
|
| 79 |
CVE ID: {cve_id}
|
| 80 |
-
|
| 81 |
-
CWE Name: {cwe_name}
|
| 82 |
-
CVSS Score: {cvss_score}
|
| 83 |
-
Summary: {summary}
|
| 84 |
"""
|
| 85 |
|
| 86 |
# Create metadata
|
| 87 |
metadata = {
|
| 88 |
'cve_id': str(cve_id),
|
| 89 |
-
'
|
| 90 |
-
'cwe_name': str(cwe_name),
|
| 91 |
-
'cvss': cvss_score,
|
| 92 |
}
|
| 93 |
|
| 94 |
documents.append(Document(page_content=content.strip(), metadata=metadata))
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
# Split documents for better retrieval
|
| 99 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -116,8 +114,9 @@ Summary: {summary}
|
|
| 116 |
|
| 117 |
except Exception as e:
|
| 118 |
print(f"β Error initializing CVE retriever: {str(e)}")
|
| 119 |
-
print("π‘ Make sure you have access to the Hugging Face dataset '
|
| 120 |
print("π‘ You may need to login with: huggingface-cli login")
|
|
|
|
| 121 |
self.cve_retriever = None
|
| 122 |
|
| 123 |
def get_repository_info(self, owner: str, repo: str) -> dict:
|
|
@@ -235,26 +234,22 @@ Summary: {summary}
|
|
| 235 |
metadata = doc.metadata
|
| 236 |
result += f"**Result {i}:**\n"
|
| 237 |
result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
|
| 238 |
-
result += f"- **CWE Code**: {metadata.get('cwe_code', 'Unknown')}\n"
|
| 239 |
-
result += f"- **CWE Name**: {metadata.get('cwe_name', 'Unknown')}\n"
|
| 240 |
-
result += f"- **CVSS Score**: {metadata.get('cvss', 'N/A')}\n"
|
| 241 |
|
| 242 |
-
# Extract
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
|
|
|
| 246 |
|
| 247 |
-
result += f"- **Description**: {
|
| 248 |
result += "---\n"
|
| 249 |
|
| 250 |
# Add summary of common patterns
|
| 251 |
cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
|
| 252 |
-
cwe_codes = [doc.metadata.get('cwe_code') for doc in docs if doc.metadata.get('cwe_code') and doc.metadata.get('cwe_code') != 'Unknown']
|
| 253 |
-
unique_cwes = list(set(cwe_codes))
|
| 254 |
|
| 255 |
result += f"\n**π Analysis Summary:**\n"
|
| 256 |
result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
|
| 257 |
-
result += f"- **Common CWE Codes**: {', '.join(unique_cwes[:5])}\n"
|
| 258 |
result += f"- **Total Matches**: {len(docs)}\n"
|
| 259 |
|
| 260 |
return result
|
|
@@ -288,13 +283,15 @@ Summary: {summary}
|
|
| 288 |
metadata = doc.metadata
|
| 289 |
cve_id = metadata.get('cve_id', 'Unknown')
|
| 290 |
|
| 291 |
-
# Extract
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
|
|
|
|
|
|
| 295 |
|
| 296 |
result += f"{i}. {cve_id}\n"
|
| 297 |
-
result += f" {
|
| 298 |
|
| 299 |
return result.strip()
|
| 300 |
|
|
|
|
| 37 |
|
| 38 |
# Load CVE dataset from Hugging Face
|
| 39 |
# Login using `huggingface-cli login` to access this dataset
|
| 40 |
+
dataset = load_dataset("CIRCL/vulnerability", split="train")
|
| 41 |
|
| 42 |
print(f"π Loaded {len(dataset)} CVE records from Hugging Face")
|
| 43 |
|
| 44 |
# Debug: Print first few records to understand dataset structure
|
| 45 |
print("π Dataset structure analysis:")
|
| 46 |
+
print(f"Dataset columns: {dataset.column_names}")
|
| 47 |
+
for i in range(min(2, len(dataset))):
|
| 48 |
print(f"Record {i}: {dict(dataset[i])}")
|
| 49 |
|
| 50 |
+
# Create documents from CVE data using CIRCL/vulnerability dataset structure
|
| 51 |
documents = []
|
| 52 |
+
valid_count = 0
|
| 53 |
+
skipped_count = 0
|
| 54 |
+
|
| 55 |
for idx, record in enumerate(dataset):
|
| 56 |
+
# Extract fields from CIRCL/vulnerability dataset
|
| 57 |
+
# Columns: id, title, description, cpes
|
| 58 |
+
cve_id = record.get('id', '')
|
| 59 |
+
# Skip title as most values are null
|
| 60 |
+
description = record.get('description', '')
|
| 61 |
+
# Skip cpes for now but may use later
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
+
# Skip records without essential information
|
| 64 |
+
if not cve_id or not description:
|
| 65 |
+
skipped_count += 1
|
| 66 |
continue
|
| 67 |
|
| 68 |
# Validate CVE ID format (should be CVE-YYYY-NNNNN)
|
| 69 |
import re
|
| 70 |
if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
|
| 71 |
+
skipped_count += 1
|
| 72 |
continue
|
| 73 |
|
| 74 |
+
# Create document content using only available fields
|
| 75 |
content = f"""
|
| 76 |
CVE ID: {cve_id}
|
| 77 |
+
Description: {description}
|
|
|
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
|
| 80 |
# Create metadata
|
| 81 |
metadata = {
|
| 82 |
'cve_id': str(cve_id),
|
| 83 |
+
'description': str(description)
|
|
|
|
|
|
|
| 84 |
}
|
| 85 |
|
| 86 |
documents.append(Document(page_content=content.strip(), metadata=metadata))
|
| 87 |
+
valid_count += 1
|
| 88 |
+
|
| 89 |
+
print(f"π Created {valid_count} valid CVE documents (skipped {skipped_count} invalid records)")
|
| 90 |
|
| 91 |
+
if not documents:
|
| 92 |
+
print("β No valid CVE documents found in dataset")
|
| 93 |
+
self.cve_retriever = None
|
| 94 |
+
return
|
| 95 |
|
| 96 |
# Split documents for better retrieval
|
| 97 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 114 |
|
| 115 |
except Exception as e:
|
| 116 |
print(f"β Error initializing CVE retriever: {str(e)}")
|
| 117 |
+
print("π‘ Make sure you have access to the Hugging Face dataset 'CIRCL/vulnerability'")
|
| 118 |
print("π‘ You may need to login with: huggingface-cli login")
|
| 119 |
+
print("π‘ Dataset columns should be: id, title, description, cpes")
|
| 120 |
self.cve_retriever = None
|
| 121 |
|
| 122 |
def get_repository_info(self, owner: str, repo: str) -> dict:
|
|
|
|
| 234 |
metadata = doc.metadata
|
| 235 |
result += f"**Result {i}:**\n"
|
| 236 |
result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
+
# Extract description from content or metadata
|
| 239 |
+
description = metadata.get('description', '')
|
| 240 |
+
if not description:
|
| 241 |
+
content_lines = doc.page_content.split('\n')
|
| 242 |
+
desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
|
| 243 |
+
description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
|
| 244 |
|
| 245 |
+
result += f"- **Description**: {description[:200]}{'...' if len(description) > 200 else ''}\n"
|
| 246 |
result += "---\n"
|
| 247 |
|
| 248 |
# Add summary of common patterns
|
| 249 |
cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
|
|
|
|
|
|
|
| 250 |
|
| 251 |
result += f"\n**π Analysis Summary:**\n"
|
| 252 |
result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
|
|
|
|
| 253 |
result += f"- **Total Matches**: {len(docs)}\n"
|
| 254 |
|
| 255 |
return result
|
|
|
|
| 283 |
metadata = doc.metadata
|
| 284 |
cve_id = metadata.get('cve_id', 'Unknown')
|
| 285 |
|
| 286 |
+
# Extract description from metadata or content
|
| 287 |
+
description = metadata.get('description', '')
|
| 288 |
+
if not description:
|
| 289 |
+
content_lines = doc.page_content.split('\n')
|
| 290 |
+
desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
|
| 291 |
+
description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
|
| 292 |
|
| 293 |
result += f"{i}. {cve_id}\n"
|
| 294 |
+
result += f" {description[:150]}{'...' if len(description) > 150 else ''}\n\n"
|
| 295 |
|
| 296 |
return result.strip()
|
| 297 |
|