HimanshuGoyal2004 committed on
Commit
2c84c03
·
1 Parent(s): bb6f140
Files changed (1) hide show
  1. app.py +29 -39
app.py CHANGED
@@ -37,45 +37,34 @@ class GitHubMCPServer:
37
 
38
  # Load CVE dataset from Hugging Face
39
  # Login using `huggingface-cli login` to access this dataset
40
- dataset = load_dataset("CIRCL/vulnerability", split="train")
41
 
42
- print(f"📊 Loaded {len(dataset)} CVE records from Hugging Face")
43
 
44
  # Debug: Print first few records to understand dataset structure
45
  print("🔍 Dataset structure analysis:")
46
- print(f"Dataset columns: {dataset.column_names}")
47
- for i in range(min(2, len(dataset))):
48
- print(f"Record {i}: {dict(dataset[i])}")
49
-
50
- # Create documents from CVE data using CIRCL/vulnerability dataset structure
51
- documents = []
52
- valid_count = 0
53
- skipped_count = 0
54
-
55
- for idx, record in enumerate(dataset):
56
- # Extract fields from CIRCL/vulnerability dataset
57
- # Columns: id, title, description, cpes
 
58
  cve_id = record.get('id', '')
59
- # Skip title as most values are null
60
  description = record.get('description', '')
61
- # Skip cpes for now but may use later
62
 
63
  # Skip records without essential information
64
  if not cve_id or not description:
65
- skipped_count += 1
66
  continue
67
 
68
- # Validate CVE ID format (should be CVE-YYYY-NNNNN)
69
- import re
70
- if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
71
- skipped_count += 1
72
- continue
73
-
74
- # Create document content using only available fields
75
- content = f"""
76
- CVE ID: {cve_id}
77
- Description: {description}
78
- """
79
 
80
  # Create metadata
81
  metadata = {
@@ -83,34 +72,35 @@ Description: {description}
83
  'description': str(description)
84
  }
85
 
86
- documents.append(Document(page_content=content.strip(), metadata=metadata))
87
- valid_count += 1
88
 
89
- print(f"📝 Created {valid_count} valid CVE documents (skipped {skipped_count} invalid records)")
90
 
91
- if not documents:
92
  print("❌ No valid CVE documents found in dataset")
93
  self.cve_retriever = None
94
  return
95
 
96
- # Split documents for better retrieval
97
  text_splitter = RecursiveCharacterTextSplitter(
98
- chunk_size=500, # Increased chunk size for better context
99
- chunk_overlap=50,
100
  add_start_index=True,
101
  strip_whitespace=True,
102
- separators=["\n\n", "\n", ".", " "]
103
  )
104
 
105
- processed_docs = text_splitter.split_documents(documents)
 
 
106
 
107
  # Initialize BM25 retriever
108
  self.cve_retriever = BM25Retriever.from_documents(
109
- processed_docs,
110
  k=3
111
  )
112
 
113
- print(f"✅ CVE Retriever initialized with {len(processed_docs)} document chunks")
114
 
115
  except Exception as e:
116
  print(f"❌ Error initializing CVE retriever: {str(e)}")
 
37
 
38
  # Load CVE dataset from Hugging Face
39
  # Login using `huggingface-cli login` to access this dataset
40
+ knowledge_base = load_dataset("CIRCL/vulnerability", split="train")
41
 
42
+ print(f"📊 Loaded {len(knowledge_base)} vulnerability records from Hugging Face")
43
 
44
  # Debug: Print first few records to understand dataset structure
45
  print("🔍 Dataset structure analysis:")
46
+ print(f"Dataset columns: {knowledge_base.column_names}")
47
+ for i in range(min(2, len(knowledge_base))):
48
+ print(f"Record {i}: {dict(knowledge_base[i])}")
49
+
50
+ # Filter to include only CVE entries (not GHSA)
51
+ print("🔍 Filtering for CVE entries only...")
52
+ cve_dataset = knowledge_base.filter(lambda row: str(row["id"]).startswith("CVE-"))
53
+
54
+ print(f"📊 Filtered to {len(cve_dataset)} CVE records (excluded GHSA entries)")
55
+
56
+ # Convert dataset entries to Document objects with metadata
57
+ source_docs = []
58
+ for record in cve_dataset:
59
  cve_id = record.get('id', '')
 
60
  description = record.get('description', '')
 
61
 
62
  # Skip records without essential information
63
  if not cve_id or not description:
 
64
  continue
65
 
66
+ # Create document content
67
+ content = f"CVE ID: {cve_id}\nDescription: {description}"
 
 
 
 
 
 
 
 
 
68
 
69
  # Create metadata
70
  metadata = {
 
72
  'description': str(description)
73
  }
74
 
75
+ source_docs.append(Document(page_content=content, metadata=metadata))
 
76
 
77
+ print(f"📝 Created {len(source_docs)} CVE document objects")
78
 
79
+ if not source_docs:
80
  print("❌ No valid CVE documents found in dataset")
81
  self.cve_retriever = None
82
  return
83
 
84
+ # Split documents into smaller chunks for better retrieval
85
  text_splitter = RecursiveCharacterTextSplitter(
86
+ chunk_size=500, # Characters per chunk
87
+ chunk_overlap=50, # Overlap between chunks to maintain context
88
  add_start_index=True,
89
  strip_whitespace=True,
90
+ separators=["\n\n", "\n", ".", " ", ""], # Priority order for splitting
91
  )
92
 
93
+ docs_processed = text_splitter.split_documents(source_docs)
94
+
95
+ print(f"📚 Knowledge base prepared with {len(docs_processed)} document chunks")
96
 
97
  # Initialize BM25 retriever
98
  self.cve_retriever = BM25Retriever.from_documents(
99
+ docs_processed,
100
  k=3
101
  )
102
 
103
+ print(f"✅ CVE Retriever initialized with {len(docs_processed)} document chunks")
104
 
105
  except Exception as e:
106
  print(f"❌ Error initializing CVE retriever: {str(e)}")