Spaces:

HimanshuGoyal2004
/

github-mcp-server

Running

App Files Community

HimanshuGoyal2004 committed on Oct 24, 2025

Commit

2c84c03

1 Parent(s): bb6f140

changes

Browse files

Files changed (1) hide show

app.py +29 -39

app.py CHANGED Viewed

@@ -37,45 +37,34 @@ class GitHubMCPServer:
             # Load CVE dataset from Hugging Face
             # Login using `huggingface-cli login` to access this dataset
-            dataset = load_dataset("CIRCL/vulnerability", split="train")
-            print(f"📊 Loaded {len(dataset)} CVE records from Hugging Face")
             # Debug: Print first few records to understand dataset structure
             print("🔍 Dataset structure analysis:")
-            print(f"Dataset columns: {dataset.column_names}")
-            for i in range(min(2, len(dataset))):
-                print(f"Record {i}: {dict(dataset[i])}")
-            # Create documents from CVE data using CIRCL/vulnerability dataset structure
-            documents = []
-            valid_count = 0
-            skipped_count = 0
-            for idx, record in enumerate(dataset):
-                # Extract fields from CIRCL/vulnerability dataset
-                # Columns: id, title, description, cpes
                 cve_id = record.get('id', '')
-                # Skip title as most values are null
                 description = record.get('description', '')
-                # Skip cpes for now but may use later
                 # Skip records without essential information
                 if not cve_id or not description:
-                    skipped_count += 1
                     continue
-                # Validate CVE ID format (should be CVE-YYYY-NNNNN)
-                import re
-                if not re.match(r'^CVE-\d{4}-\d+$', str(cve_id)):
-                    skipped_count += 1
-                    continue
-                # Create document content using only available fields
-                content = f"""
-CVE ID: {cve_id}
-Description: {description}
-"""
                 # Create metadata
                 metadata = {
@@ -83,34 +72,35 @@ Description: {description}
                     'description': str(description)
                 }
-                documents.append(Document(page_content=content.strip(), metadata=metadata))
-                valid_count += 1
-            print(f"📝 Created {valid_count} valid CVE documents (skipped {skipped_count} invalid records)")
-            if not documents:
                 print("❌ No valid CVE documents found in dataset")
                 self.cve_retriever = None
                 return
-            # Split documents for better retrieval
             text_splitter = RecursiveCharacterTextSplitter(
-                chunk_size=500,  # Increased chunk size for better context
-                chunk_overlap=50,
                 add_start_index=True,
                 strip_whitespace=True,
-                separators=["\n\n", "\n", ".", " "]
             )
-            processed_docs = text_splitter.split_documents(documents)
             # Initialize BM25 retriever
             self.cve_retriever = BM25Retriever.from_documents(
-                processed_docs,
                 k=3
             )
-            print(f"✅ CVE Retriever initialized with {len(processed_docs)} document chunks")
         except Exception as e:
             print(f"❌ Error initializing CVE retriever: {str(e)}")

             # Load CVE dataset from Hugging Face
             # Login using `huggingface-cli login` to access this dataset
+            knowledge_base = load_dataset("CIRCL/vulnerability", split="train")
+            print(f"📊 Loaded {len(knowledge_base)} vulnerability records from Hugging Face")
             # Debug: Print first few records to understand dataset structure
             print("🔍 Dataset structure analysis:")
+            print(f"Dataset columns: {knowledge_base.column_names}")
+            for i in range(min(2, len(knowledge_base))):
+                print(f"Record {i}: {dict(knowledge_base[i])}")
+            # Filter to include only CVE entries (not GHSA)
+            print("🔍 Filtering for CVE entries only...")
+            cve_dataset = knowledge_base.filter(lambda row: str(row["id"]).startswith("CVE-"))
+            print(f"📊 Filtered to {len(cve_dataset)} CVE records (excluded GHSA entries)")
+            # Convert dataset entries to Document objects with metadata
+            source_docs = []
+            for record in cve_dataset:
                 cve_id = record.get('id', '')
                 description = record.get('description', '')
                 # Skip records without essential information
                 if not cve_id or not description:
                     continue
+                # Create document content
+                content = f"CVE ID: {cve_id}\nDescription: {description}"
                 # Create metadata
                 metadata = {
                     'description': str(description)
                 }
+                source_docs.append(Document(page_content=content, metadata=metadata))
+            print(f"📝 Created {len(source_docs)} CVE document objects")
+            if not source_docs:
                 print("❌ No valid CVE documents found in dataset")
                 self.cve_retriever = None
                 return
+            # Split documents into smaller chunks for better retrieval
             text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=500,  # Characters per chunk
+                chunk_overlap=50,  # Overlap between chunks to maintain context
                 add_start_index=True,
                 strip_whitespace=True,
+                separators=["\n\n", "\n", ".", " ", ""],  # Priority order for splitting
             )
+            docs_processed = text_splitter.split_documents(source_docs)
+            print(f"📚 Knowledge base prepared with {len(docs_processed)} document chunks")
             # Initialize BM25 retriever
             self.cve_retriever = BM25Retriever.from_documents(
+                docs_processed,
                 k=3
             )
+            print(f"✅ CVE Retriever initialized with {len(docs_processed)} document chunks")
         except Exception as e:
             print(f"❌ Error initializing CVE retriever: {str(e)}")