Spaces:

Agents-MCP-Hackathon
/

DSATP_AI_cybersecurity_agent

Runtime error

App Files Files Community

Rithvickkr committed on Jun 9, 2025

Commit

6e9c0fd

1 Parent(s): 21b1e62

Integrated NVD API via Space variables, fixed multi-threat detection and irrelevant CVEs

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +92 -69

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ corpus/cache/

app.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import gradio as gr
 import requests
-from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
-from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 import os
 import re
 import ast
@@ -12,32 +10,20 @@ import time
 import logging
 from retrying import retry
 import base64
 # Set up logging
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Suppress Hugging Face symlink warning
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 # Modal Mistral-7B API endpoint
-MODAL_API = "https://rithvickkumar27--mistral-7b-api-analyze.modal.run"
-# Configure LlamaIndex
-Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
-# Initialize LlamaIndex
-def init_llama_index():
-    try:
-        documents = SimpleDirectoryReader("corpus", filename_as_id=True).load_data()
-        logger.info(f"Loaded {len(documents)} corpus documents")
-        return VectorStoreIndex.from_documents(documents)
-    except Exception as e:
-        logger.error(f"Error loading corpus: {e}")
-        return None
-index = init_llama_index()
-query_engine = index.as_retriever() if index else None
 # Retry decorator for Mistral-7B API
 @retry(stop_max_attempt_number=3, wait_fixed=2000)
@@ -61,25 +47,63 @@ def call_mistral_llm(prompt):
         logger.error(f"Mistral API request failed: {e}")
         raise
 # Basic Python code analysis
 def analyze_python_code(content: str) -> dict:
     try:
         tree = ast.parse(content)
         suspicious_patterns = []
         for node in ast.walk(tree):
-            # Check for Base64 decoding
             if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
                 if node.func.attr == 'b64decode' and isinstance(node.func.value, ast.Name) and node.func.value.id == 'base64':
                     suspicious_patterns.append("Base64 decoding detected")
-            # Check for exec usage
             if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'exec':
-                suspicious_patterns.append("Dynamic code execution (exec) detected")
-            # Check for urllib.request or similar imports
             if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
                 for name in (node.names if isinstance(node, ast.Import) else node.names):
                     if name.name in ['urllib', 'urllib.request', 'requests']:
                         suspicious_patterns.append(f"Suspicious import: {name.name}")
-            # Check for suspicious URLs in strings
             if isinstance(node, ast.Str) or (isinstance(node, ast.Constant) and isinstance(node.value, str)):
                 if re.search(r'http[s]?://.*(evil|malicious|bad)[^\s]*', node.value, re.IGNORECASE):
                     suspicious_patterns.append(f"Suspicious URL: {node.value}")
@@ -88,7 +112,7 @@ def analyze_python_code(content: str) -> dict:
                 "classification": "Malware Detected",
                 "severity": "Critical",
                 "mitigation": "Quarantine file, run antivirus, block suspicious URLs",
-                "confidence": 0.95,
                 "details": suspicious_patterns
             }
     except SyntaxError:
@@ -233,10 +257,10 @@ def dsatp_parse_log(text: str) -> dict:
         severity_order = {"Critical": 3, "High": 2, "Medium": 1, "Safe": 0}
         highest_threat = max(detected_threats, key=lambda x: (severity_order.get(x["severity"], 0), x["confidence"]))
         logger.info(f"Detected threats: {len(detected_threats)}, Selected: {highest_threat}")
-        return highest_threat
     logger.info("No threats detected")
-    return {"classification": "No Threat", "severity": "Safe", "mitigation": "None", "confidence": 0.5}
 # Enhanced DSATP YARA scanning
 def dsatp_yara_scan(file_path: str) -> dict:
@@ -249,7 +273,7 @@ def dsatp_yara_scan(file_path: str) -> dict:
         if file_path.endswith('.py'):
             python_analysis = analyze_python_code(content)
             if python_analysis:
-                return python_analysis
         import yara
         rules = yara.compile(source="""
@@ -352,7 +376,7 @@ def dsatp_yara_scan(file_path: str) -> dict:
                         "classification": "Malware Detected",
                         "severity": "Critical",
                         "mitigation": "Quarantine file, run antivirus",
-                        "confidence": 0.95
                     })
                 elif match.rule == "SuspiciousBehavior":
                     detected_threats.append({
@@ -402,18 +426,19 @@ def dsatp_yara_scan(file_path: str) -> dict:
             severity_order = {"Critical": 3, "High": 2, "Medium": 1, "Safe": 0}
             highest_threat = max(detected_threats, key=lambda x: (severity_order.get(x["severity"], 0), x["confidence"]))
             logger.info(f"YARA scan detected threats: {len(detected_threats)}, Selected: {highest_threat}")
-            return highest_threat
         logger.info("YARA scan: No threats detected")
         return {
             "classification": "No Malware",
             "severity": "Safe",
             "mitigation": "None",
-            "confidence": 0.7
         }
     except Exception as e:
         logger.error(f"YARA scan error: {e}")
-        return {"error": str(e), "severity": "Unknown", "mitigation": "Check file format"}
 # Chatbot function
 def chatbot_response(user_input, file, history, state):
@@ -428,56 +453,54 @@ def chatbot_response(user_input, file, history, state):
         try:
             input_text = open(file.name, "r").read()
             scan_result = dsatp_yara_scan(file.name)
-            all_threats.append(scan_result)
         except Exception as e:
-            scan_result = {"error": f"File error: {e}", "severity": "Unknown", "mitigation": "Check file"}
     else:
         scan_result = dsatp_parse_log(input_text)
-        all_threats.append(scan_result)
     context_str = "No relevant vulnerabilities found."
-    if query_engine:
         try:
             # Map classification to precise keywords for relevant CVEs
             threat_keywords = {
-                "Brute-Force Attempt": "brute force, ssh, login attempt, authentication failure, openssh, password attack, cwe-287, cwe-307",
-                "Malware Detected": "malware, trojan, ransomware, payload, malicious script, backdoor, virus, cwe-94, cwe-506, cwe-119",
-                "Network Intrusion": "firewall, intrusion, ufw, network attack, port scan, unauthorized access, cwe-284",
-                "Privilege Escalation": "privilege escalation, sudo, root, unauthorized access, cwe-269, cwe-250",
-                "Persistence Mechanism": "ssh tunnel, reverse ssh, persistence, backdoor, remote access, cwe-284",
-                "System Compromise": "compromise, breach, unauthorized access, cwe-284",
-                "Unauthorized Access": "unauthorized access, login failure, cwe-287",
-                "Resource Abuse": "resource abuse, crypto-miner, denial of service, cwe-400",
-                "Firmware Vulnerability": "firmware, vulnerability, iot, cwe-119",
-                "DDoS Attack": "ddos, denial of service, network flood, cwe-400",
-                "Phishing Attempt": "phishing, malicious url, social engineering, cwe-601",
-                "SQL Injection": "sql injection, database attack, cwe-89",
-                "Cross-Site Scripting": "xss, cross-site scripting, web attack, cwe-79",
-                "Suspicious Activity": "suspicious activity, anomaly, heuristic, cwe-693"
             }
             classification = scan_result.get("classification", "unknown")
-            keywords = threat_keywords.get(classification, "security threat")
-            query = f"Mitigation for: {keywords}"
-            results = query_engine.retrieve(query)
-            context_items = []
-            seen_cves = set()
-            for res in results[:3]:
-                # Extract CVE IDs and filter duplicates
-                cve_matches = re.findall(r'CVE-\d{4}-\d{5,7}', res.text)
-                if cve_matches and cve_matches[0] not in seen_cves:
-                    context_items.append(res.text)
-                    seen_cves.add(cve_matches[0])
-            context_str = "\n\n".join(context_items) if context_items else "No relevant vulnerabilities found for this threat."
-            logger.debug(f"LlamaIndex query: {query}, Results: {len(results)}, Unique CVEs: {len(seen_cves)}")
         except Exception as e:
-            logger.error(f"LlamaIndex error: {e}")
             context_str = f"Context error: {e}"
     if "error" not in scan_result:
-        # Prepare list of all detected threats for Mistral-7B
-        other_threats = [t for t in all_threats if t != scan_result and "error" not in t]
         other_threats_summary = ""
-        if other_threats:
             other_threats_summary = "\nOther detected threats include:\n" + "\n".join(
                 [f"- {t['classification']} (Severity: {t['severity']}, Confidence: {t['confidence']:.1f})" for t in other_threats]
             )
@@ -489,7 +512,7 @@ def chatbot_response(user_input, file, history, state):
         Mitigation: {scan_result['mitigation']}
         Confidence: {scan_result['confidence']}
         Additional Threats: {other_threats_summary}
-        Provide a concise response to the user, summarizing the primary threat and recommended actions in a professional tone. If additional threats are detected, briefly mention them but focus on the primary threat. Include actionable steps tailored to the primary threat. Do not mention vulnerabilities from the context unless they are directly related to the detected threat.
         """
         try:
             llm_response = call_mistral_llm(prompt)

 import gradio as gr
 import requests
 import os
 import re
 import ast
 import logging
 from retrying import retry
 import base64
+import pickle
 # Set up logging
 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Suppress warnings
 os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
 # Modal Mistral-7B API endpoint
+MODAL_API = os.getenv("MODAL_API", "https://rithvickkumar27--mistral-7b-api-analyze.modal.run")
+NVD_API_KEY = os.getenv("NVD_API_KEY")
+if not NVD_API_KEY:
+    logger.error("NVD_API_KEY not set in environment variables, API queries will fail")
 # Retry decorator for Mistral-7B API
 @retry(stop_max_attempt_number=3, wait_fixed=2000)
         logger.error(f"Mistral API request failed: {e}")
         raise
+# NVD API query with caching
def _format_cve(item):
    """Return ``"<CVE id>: <description>"`` for one NVD entry, or ``None`` if it is malformed."""
    cve = item.get("cve") or {}
    cve_id = cve.get("id")
    descriptions = cve.get("descriptions") or []
    if not cve_id or not descriptions:
        return None
    # Prefer the English text (the caller filters on English keywords);
    # fall back to the first entry, which is what the code used previously.
    chosen = next((d for d in descriptions if d.get("lang") == "en"), descriptions[0])
    value = chosen.get("value")
    return f"{cve_id}: {value}" if value else None


def query_nvd(keywords):
    """Search the NVD CVE 2.0 API by keyword, using a 24-hour on-disk cache.

    Args:
        keywords: Space-separated search terms, sent as ``keywordSearch``.

    Returns:
        A list of ``"<CVE id>: <description>"`` strings (possibly empty) on a
        fresh cache hit or an HTTP 200 response; ``None`` when the request
        fails, is rate limited (429), or returns any other status.
    """
    cache_dir = "corpus/cache"
    # Keep the historical "spaces -> underscores" naming, but also neutralise
    # path separators and other unsafe characters so the name stays inside cache_dir.
    cache_name = re.sub(r"[^\w.-]", "_", keywords)
    cache_file = f"{cache_dir}/{cache_name}.pkl"

    # Check cache
    if os.path.exists(cache_file):
        try:
            with open(cache_file, "rb") as f:
                # NOTE(security): pickle.load can execute arbitrary code if the
                # cache file is tampered with; only safe while corpus/cache is
                # written exclusively by this process. Consider JSON instead.
                cached_data = pickle.load(f)
            if time.time() - cached_data["timestamp"] < 86400:  # Cache valid for 24 hours
                logger.debug(f"Using cached NVD data for: {keywords}")
                return cached_data["results"]
        except Exception as e:
            # A corrupt or unreadable cache entry is not fatal; fall through to the API.
            logger.warning(f"Cache read error: {e}")

    # Query NVD API
    try:
        url = "https://services.nvd.nist.gov/rest/json/cves/2.0"
        params = {"keywordSearch": keywords, "resultsPerPage": 10}
        headers = {"apiKey": NVD_API_KEY}
        response = requests.get(url, params=params, headers=headers, timeout=10)
        if response.status_code == 429:
            logger.error("NVD rate limit exceeded")
            return None
        if response.status_code != 200:
            logger.error(f"NVD API error: Status {response.status_code}")
            return None
        data = response.json()
        # Skip malformed entries instead of letting one bad record drop the whole batch.
        formatted = (_format_cve(item) for item in data.get("vulnerabilities", []))
        results = [entry for entry in formatted if entry is not None]
    except Exception as e:
        logger.error(f"NVD API request failed: {e}")
        return None

    # Save to cache. This is best-effort: a failed write must not discard
    # results that were fetched successfully.
    try:
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_file, "wb") as f:
            pickle.dump({"timestamp": time.time(), "results": results}, f)
    except Exception as e:
        logger.warning(f"Cache write error: {e}")

    logger.info(f"Fetched {len(results)} CVEs from NVD for: {keywords}")
    return results
 # Basic Python code analysis
 def analyze_python_code(content: str) -> dict:
     try:
         tree = ast.parse(content)
         suspicious_patterns = []
         for node in ast.walk(tree):
             if isinstance(node, ast.Call) and isinstance(node.func, ast.Attribute):
                 if node.func.attr == 'b64decode' and isinstance(node.func.value, ast.Name) and node.func.value.id == 'base64':
                     suspicious_patterns.append("Base64 decoding detected")
             if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == 'exec':
+                suspicious_patterns.append("Dynamic code execution (exec)")
             if isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
                 for name in (node.names if isinstance(node, ast.Import) else node.names):
                     if name.name in ['urllib', 'urllib.request', 'requests']:
                         suspicious_patterns.append(f"Suspicious import: {name.name}")
             if isinstance(node, ast.Str) or (isinstance(node, ast.Constant) and isinstance(node.value, str)):
                 if re.search(r'http[s]?://.*(evil|malicious|bad)[^\s]*', node.value, re.IGNORECASE):
                     suspicious_patterns.append(f"Suspicious URL: {node.value}")
                 "classification": "Malware Detected",
                 "severity": "Critical",
                 "mitigation": "Quarantine file, run antivirus, block suspicious URLs",
+                "confidence": 0.9,
                 "details": suspicious_patterns
             }
     except SyntaxError:
         severity_order = {"Critical": 3, "High": 2, "Medium": 1, "Safe": 0}
         highest_threat = max(detected_threats, key=lambda x: (severity_order.get(x["severity"], 0), x["confidence"]))
         logger.info(f"Detected threats: {len(detected_threats)}, Selected: {highest_threat}")
+        return highest_threat | {"all_threats": detected_threats}
     logger.info("No threats detected")
+    return {"classification": "No Threat", "severity": "Safe", "mitigation": "None", "confidence": 0.5, "all_threats": []}
 # Enhanced DSATP YARA scanning
 def dsatp_yara_scan(file_path: str) -> dict:
         if file_path.endswith('.py'):
             python_analysis = analyze_python_code(content)
             if python_analysis:
+                return python_analysis | {"all_threats": [python_analysis]}
         import yara
         rules = yara.compile(source="""
                         "classification": "Malware Detected",
                         "severity": "Critical",
                         "mitigation": "Quarantine file, run antivirus",
+                        "confidence": 0.9
                     })
                 elif match.rule == "SuspiciousBehavior":
                     detected_threats.append({
             severity_order = {"Critical": 3, "High": 2, "Medium": 1, "Safe": 0}
             highest_threat = max(detected_threats, key=lambda x: (severity_order.get(x["severity"], 0), x["confidence"]))
             logger.info(f"YARA scan detected threats: {len(detected_threats)}, Selected: {highest_threat}")
+            return highest_threat | {"all_threats": detected_threats}
         logger.info("YARA scan: No threats detected")
         return {
             "classification": "No Malware",
             "severity": "Safe",
             "mitigation": "None",
+            "confidence": 0.7,
+            "all_threats": []
         }
     except Exception as e:
         logger.error(f"YARA scan error: {e}")
+        return {"error": str(e), "severity": "Unknown", "mitigation": "Check file format", "all_threats": []}
 # Chatbot function
 def chatbot_response(user_input, file, history, state):
         try:
             input_text = open(file.name, "r").read()
             scan_result = dsatp_yara_scan(file.name)
         except Exception as e:
+            scan_result = {"error": f"File error: {e}", "severity": "Unknown", "mitigation": "Check file", "all_threats": []}
     else:
         scan_result = dsatp_parse_log(input_text)
+    all_threats = scan_result.get("all_threats", [])
     context_str = "No relevant vulnerabilities found."
+    if NVD_API_KEY:
         try:
             # Map classification to precise keywords for relevant CVEs
             threat_keywords = {
+                "Brute-Force Attempt": "brute force ssh login attempt authentication failure openssh password attack cwe-287 cwe-307",
+                "Malware Detected": "malware trojan ransomware payload malicious script backdoor virus cwe-94 cwe-506 cwe-119 code injection python",
+                "Network Intrusion": "firewall intrusion ufw network attack port scan unauthorized access cwe-284",
+                "Privilege Escalation": "privilege escalation sudo root unauthorized access cwe-269 cwe-250",
+                "Persistence Mechanism": "ssh tunnel reverse ssh persistence backdoor remote access cwe-284",
+                "System Compromise": "compromise breach unauthorized access cwe-284",
+                "Unauthorized Access": "unauthorized access login failure cwe-287",
+                "Resource Abuse": "resource abuse crypto-miner denial of service cwe-400",
+                "Firmware Vulnerability": "firmware vulnerability iot cwe-119",
+                "DDoS Attack": "ddos denial of service network flood cwe-400",
+                "Phishing Attempt": "phishing malicious url social engineering cwe-601",
+                "SQL Injection": "sql injection database attack cwe-89",
+                "Cross-Site Scripting": "xss cross-site scripting web attack cwe-79",
+                "Suspicious Activity": "suspicious activity anomaly heuristic cwe-693"
             }
             classification = scan_result.get("classification", "unknown")
+            keywords = threat_keywords.get(classification, "security threat").replace(',', '')
+            nvd_results = query_nvd(keywords)
+            if nvd_results:
+                context_items = []
+                seen_cves = set()
+                for result in nvd_results[:3]:
+                    cve_matches = re.findall(r'CVE-\d{4}-\d{5,7}', result)
+                    if cve_matches and cve_matches[0] not in seen_cves:
+                        if any(keyword.lower() in result.lower() for keyword in keywords.split()):
+                            context_items.append(result)
+                            seen_cves.add(cve_matches[0])
+                context_str = "\n\n".join(context_items) if context_items else "No relevant vulnerabilities found for this threat."
+            logger.debug(f"NVD query: {keywords}, Results: {len(nvd_results) if nvd_results else 0}")
         except Exception as e:
+            logger.error(f"NVD error: {e}")
             context_str = f"Context error: {e}"
     if "error" not in scan_result:
         other_threats_summary = ""
+        if len(all_threats) > 1:
+            other_threats = [t for t in all_threats if t != scan_result]
             other_threats_summary = "\nOther detected threats include:\n" + "\n".join(
                 [f"- {t['classification']} (Severity: {t['severity']}, Confidence: {t['confidence']:.1f})" for t in other_threats]
             )
         Mitigation: {scan_result['mitigation']}
         Confidence: {scan_result['confidence']}
         Additional Threats: {other_threats_summary}
+        Provide a concise response to the user, summarizing the primary threat and recommended actions in a professional tone. If additional threats are detected, briefly mention them but focus on the primary threat. Include actionable steps tailored to the primary threat. Do not mention vulnerabilities from the context unless explicitly confirmed as related to the detected threat.
         """
         try:
             llm_response = call_mistral_llm(prompt)