HimanshuGoyal2004 committed on
Commit
bb6f140
·
1 Parent(s): 1168745
Files changed (1) hide show
  1. app.py +43 -46
app.py CHANGED
@@ -37,63 +37,61 @@ class GitHubMCPServer:
37
 
38
  # Load CVE dataset from Hugging Face
39
  # Login using `huggingface-cli login` to access this dataset
40
- dataset = load_dataset("Baction/cve", split="train")
41
 
42
  print(f"πŸ“Š Loaded {len(dataset)} CVE records from Hugging Face")
43
 
44
  # Debug: Print first few records to understand dataset structure
45
  print("πŸ” Dataset structure analysis:")
46
- for i in range(min(3, len(dataset))):
 
47
  print(f"Record {i}: {dict(dataset[i])}")
48
 
49
- # Create documents from CVE data
50
  documents = []
 
 
 
51
  for idx, record in enumerate(dataset):
52
- # Extract relevant fields from the dataset - check multiple possible field names
53
- cve_id = (record.get('cve_id') or
54
- record.get('CVE_ID') or
55
- record.get('id') or
56
- record.get('cve') or
57
- f'CVE-UNKNOWN-{idx}')
58
-
59
- cwe_code = record.get('cwe_code', record.get('CWE', 'Unknown'))
60
- cwe_name = record.get('cwe_name', record.get('cwe_description', 'Unknown'))
61
- cvss_score = record.get('cvss_score', record.get('cvss', record.get('CVSS', 'N/A')))
62
- summary = (record.get('summary') or
63
- record.get('description') or
64
- record.get('Description') or
65
- 'No summary available')
66
 
67
- # Skip records without essential information or invalid CVE IDs
68
- if not summary or summary == 'No summary available':
 
69
  continue
70
 
71
  # Validate CVE ID format (should be CVE-YYYY-NNNNN)
72
  import re
73
  if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
74
- print(f"⚠️ Skipping invalid CVE ID: {cve_id}")
75
  continue
76
 
77
- # Create document content
78
  content = f"""
79
  CVE ID: {cve_id}
80
- CWE Code: {cwe_code}
81
- CWE Name: {cwe_name}
82
- CVSS Score: {cvss_score}
83
- Summary: {summary}
84
  """
85
 
86
  # Create metadata
87
  metadata = {
88
  'cve_id': str(cve_id),
89
- 'cwe_code': str(cwe_code),
90
- 'cwe_name': str(cwe_name),
91
- 'cvss': cvss_score,
92
  }
93
 
94
  documents.append(Document(page_content=content.strip(), metadata=metadata))
 
 
 
95
 
96
- print(f"πŸ“ Created {len(documents)} CVE documents")
 
 
 
97
 
98
  # Split documents for better retrieval
99
  text_splitter = RecursiveCharacterTextSplitter(
@@ -116,8 +114,9 @@ Summary: {summary}
116
 
117
  except Exception as e:
118
  print(f"❌ Error initializing CVE retriever: {str(e)}")
119
- print("πŸ’‘ Make sure you have access to the Hugging Face dataset 'Baction/cve'")
120
  print("πŸ’‘ You may need to login with: huggingface-cli login")
 
121
  self.cve_retriever = None
122
 
123
  def get_repository_info(self, owner: str, repo: str) -> dict:
@@ -235,26 +234,22 @@ Summary: {summary}
235
  metadata = doc.metadata
236
  result += f"**Result {i}:**\n"
237
  result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
238
- result += f"- **CWE Code**: {metadata.get('cwe_code', 'Unknown')}\n"
239
- result += f"- **CWE Name**: {metadata.get('cwe_name', 'Unknown')}\n"
240
- result += f"- **CVSS Score**: {metadata.get('cvss', 'N/A')}\n"
241
 
242
- # Extract summary from content
243
- content_lines = doc.page_content.split('\n')
244
- summary_line = next((line for line in content_lines if line.startswith('Summary:')), '')
245
- summary = summary_line.replace('Summary: ', '').strip() if summary_line else 'No summary available'
 
 
246
 
247
- result += f"- **Description**: {summary[:200]}{'...' if len(summary) > 200 else ''}\n"
248
  result += "---\n"
249
 
250
  # Add summary of common patterns
251
  cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
252
- cwe_codes = [doc.metadata.get('cwe_code') for doc in docs if doc.metadata.get('cwe_code') and doc.metadata.get('cwe_code') != 'Unknown']
253
- unique_cwes = list(set(cwe_codes))
254
 
255
  result += f"\n**πŸ“Š Analysis Summary:**\n"
256
  result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
257
- result += f"- **Common CWE Codes**: {', '.join(unique_cwes[:5])}\n"
258
  result += f"- **Total Matches**: {len(docs)}\n"
259
 
260
  return result
@@ -288,13 +283,15 @@ Summary: {summary}
288
  metadata = doc.metadata
289
  cve_id = metadata.get('cve_id', 'Unknown')
290
 
291
- # Extract summary from content
292
- content_lines = doc.page_content.split('\n')
293
- summary_line = next((line for line in content_lines if line.startswith('Summary:')), '')
294
- summary = summary_line.replace('Summary: ', '').strip() if summary_line else 'No summary available'
 
 
295
 
296
  result += f"{i}. {cve_id}\n"
297
- result += f" {summary}\n\n"
298
 
299
  return result.strip()
300
 
 
37
 
38
  # Load CVE dataset from Hugging Face
39
  # Login using `huggingface-cli login` to access this dataset
40
+ dataset = load_dataset("CIRCL/vulnerability", split="train")
41
 
42
  print(f"πŸ“Š Loaded {len(dataset)} CVE records from Hugging Face")
43
 
44
  # Debug: Print first few records to understand dataset structure
45
  print("πŸ” Dataset structure analysis:")
46
+ print(f"Dataset columns: {dataset.column_names}")
47
+ for i in range(min(2, len(dataset))):
48
  print(f"Record {i}: {dict(dataset[i])}")
49
 
50
+ # Create documents from CVE data using CIRCL/vulnerability dataset structure
51
  documents = []
52
+ valid_count = 0
53
+ skipped_count = 0
54
+
55
  for idx, record in enumerate(dataset):
56
+ # Extract fields from CIRCL/vulnerability dataset
57
+ # Columns: id, title, description, cpes
58
+ cve_id = record.get('id', '')
59
+ # Skip title as most values are null
60
+ description = record.get('description', '')
61
+ # Skip cpes for now but may use later
 
 
 
 
 
 
 
 
62
 
63
+ # Skip records without essential information
64
+ if not cve_id or not description:
65
+ skipped_count += 1
66
  continue
67
 
68
  # Validate CVE ID format (should be CVE-YYYY-NNNNN)
69
  import re
70
  if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
71
+ skipped_count += 1
72
  continue
73
 
74
+ # Create document content using only available fields
75
  content = f"""
76
  CVE ID: {cve_id}
77
+ Description: {description}
 
 
 
78
  """
79
 
80
  # Create metadata
81
  metadata = {
82
  'cve_id': str(cve_id),
83
+ 'description': str(description)
 
 
84
  }
85
 
86
  documents.append(Document(page_content=content.strip(), metadata=metadata))
87
+ valid_count += 1
88
+
89
+ print(f"πŸ“ Created {valid_count} valid CVE documents (skipped {skipped_count} invalid records)")
90
 
91
+ if not documents:
92
+ print("❌ No valid CVE documents found in dataset")
93
+ self.cve_retriever = None
94
+ return
95
 
96
  # Split documents for better retrieval
97
  text_splitter = RecursiveCharacterTextSplitter(
 
114
 
115
  except Exception as e:
116
  print(f"❌ Error initializing CVE retriever: {str(e)}")
117
+ print("πŸ’‘ Make sure you have access to the Hugging Face dataset 'CIRCL/vulnerability'")
118
  print("πŸ’‘ You may need to login with: huggingface-cli login")
119
+ print("πŸ’‘ Dataset columns should be: id, title, description, cpes")
120
  self.cve_retriever = None
121
 
122
  def get_repository_info(self, owner: str, repo: str) -> dict:
 
234
  metadata = doc.metadata
235
  result += f"**Result {i}:**\n"
236
  result += f"- **CVE ID**: {metadata.get('cve_id', 'Unknown')}\n"
 
 
 
237
 
238
+ # Extract description from content or metadata
239
+ description = metadata.get('description', '')
240
+ if not description:
241
+ content_lines = doc.page_content.split('\n')
242
+ desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
243
+ description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
244
 
245
+ result += f"- **Description**: {description[:200]}{'...' if len(description) > 200 else ''}\n"
246
  result += "---\n"
247
 
248
  # Add summary of common patterns
249
  cve_ids = [doc.metadata.get('cve_id') for doc in docs if doc.metadata.get('cve_id')]
 
 
250
 
251
  result += f"\n**πŸ“Š Analysis Summary:**\n"
252
  result += f"- **CVE Examples**: {', '.join(cve_ids[:3])}{'...' if len(cve_ids) > 3 else ''}\n"
 
253
  result += f"- **Total Matches**: {len(docs)}\n"
254
 
255
  return result
 
283
  metadata = doc.metadata
284
  cve_id = metadata.get('cve_id', 'Unknown')
285
 
286
+ # Extract description from metadata or content
287
+ description = metadata.get('description', '')
288
+ if not description:
289
+ content_lines = doc.page_content.split('\n')
290
+ desc_line = next((line for line in content_lines if line.startswith('Description:')), '')
291
+ description = desc_line.replace('Description: ', '').strip() if desc_line else 'No description available'
292
 
293
  result += f"{i}. {cve_id}\n"
294
+ result += f" {description[:150]}{'...' if len(description) > 150 else ''}\n\n"
295
 
296
  return result.strip()
297