Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,11 +4,12 @@ import pdfplumber
|
|
| 4 |
import numpy as np
|
| 5 |
import faiss
|
| 6 |
import zipfile
|
|
|
|
| 7 |
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
| 8 |
from sentence_transformers import SentenceTransformer
|
| 9 |
|
| 10 |
# -------------------------
|
| 11 |
-
# Step 1: Unzip docs.zip if
|
| 12 |
# -------------------------
|
| 13 |
def unzip_docs():
|
| 14 |
if os.path.exists("docs.zip") and not os.path.exists("docs"):
|
|
@@ -18,12 +19,11 @@ def unzip_docs():
|
|
| 18 |
print("β
Extracted to /docs")
|
| 19 |
|
| 20 |
# -------------------------
|
| 21 |
-
# Step 2:
|
| 22 |
# -------------------------
|
| 23 |
def load_docs(folder="docs"):
|
| 24 |
all_text = ""
|
| 25 |
found_files = []
|
| 26 |
-
|
| 27 |
for root, _, files in os.walk(folder):
|
| 28 |
for fname in files:
|
| 29 |
if fname.lower().endswith(".pdf"):
|
|
@@ -38,17 +38,17 @@ def load_docs(folder="docs"):
|
|
| 38 |
print(f"π Found {len(found_files)} PDF files:")
|
| 39 |
for f in found_files:
|
| 40 |
print(" -", f)
|
| 41 |
-
|
|
|
|
| 42 |
return all_text
|
| 43 |
|
| 44 |
# -------------------------
|
| 45 |
-
#
|
| 46 |
# -------------------------
|
| 47 |
def chunk_text(text, max_words=200):
|
| 48 |
-
|
| 49 |
-
paras = re.split(r'\n{2,}', text)
|
| 50 |
chunks, current = [], ""
|
| 51 |
-
for para in
|
| 52 |
if len((current + " " + para).split()) < max_words:
|
| 53 |
current += " " + para
|
| 54 |
else:
|
|
@@ -56,70 +56,80 @@ def chunk_text(text, max_words=200):
|
|
| 56 |
current = para
|
| 57 |
if current:
|
| 58 |
chunks.append(current.strip())
|
|
|
|
|
|
|
|
|
|
| 59 |
return [c for c in chunks if len(c.split()) > 20]
|
| 60 |
|
| 61 |
# -------------------------
|
| 62 |
-
# Build
|
| 63 |
# -------------------------
|
| 64 |
def build_index():
|
| 65 |
unzip_docs()
|
| 66 |
raw = load_docs("docs")
|
| 67 |
global doc_chunks
|
| 68 |
doc_chunks = chunk_text(raw)
|
| 69 |
-
|
| 70 |
embeddings = embedder.encode(doc_chunks, convert_to_numpy=True, normalize_embeddings=True)
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
return
|
| 74 |
|
| 75 |
# -------------------------
|
| 76 |
-
#
|
| 77 |
# -------------------------
|
| 78 |
def generate_answer(question):
|
| 79 |
q_embed = embedder.encode([question], normalize_embeddings=True)
|
| 80 |
D, I = index.search(np.array(q_embed), top_k)
|
| 81 |
|
|
|
|
|
|
|
|
|
|
| 82 |
top_passages = [f"Passage {i+1}:\n{doc_chunks[i]}" for i in I[0]]
|
| 83 |
context = "\n\n".join(top_passages)
|
| 84 |
|
| 85 |
prompt = (
|
| 86 |
-
"You are SecurityGPT, a cybersecurity assistant. Use the
|
| 87 |
-
f"{context}\n\n"
|
| 88 |
f"Question: {question}\n\nAnswer:"
|
| 89 |
)
|
| 90 |
|
| 91 |
input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
|
| 92 |
output_ids = model.generate(
|
| 93 |
input_ids,
|
| 94 |
-
max_length=
|
| 95 |
-
num_beams=
|
| 96 |
-
temperature=0.
|
| 97 |
-
repetition_penalty=1.
|
| 98 |
early_stopping=True
|
| 99 |
)
|
| 100 |
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
# -------------------------
|
| 104 |
-
# Load
|
| 105 |
# -------------------------
|
| 106 |
embedder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")
|
| 107 |
-
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-
|
| 108 |
-
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-
|
| 109 |
-
|
| 110 |
top_k = 5
|
| 111 |
doc_chunks = []
|
| 112 |
index = build_index()
|
| 113 |
|
| 114 |
# -------------------------
|
| 115 |
-
# Gradio App
|
| 116 |
# -------------------------
|
| 117 |
demo = gr.Interface(
|
| 118 |
fn=generate_answer,
|
| 119 |
-
inputs=gr.Textbox(label="Ask SecurityGPT", placeholder="e.g. How do I
|
| 120 |
outputs=gr.Textbox(label="Answer"),
|
| 121 |
title="π SecurityGPT",
|
| 122 |
-
description="Ask cybersecurity questions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
)
|
| 124 |
|
| 125 |
demo.launch()
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
import faiss
|
| 6 |
import zipfile
|
| 7 |
+
import re
|
| 8 |
from transformers import T5Tokenizer, T5ForConditionalGeneration
|
| 9 |
from sentence_transformers import SentenceTransformer
|
| 10 |
|
| 11 |
# -------------------------
|
| 12 |
+
# Step 1: Unzip docs.zip if needed
|
| 13 |
# -------------------------
|
| 14 |
def unzip_docs():
|
| 15 |
if os.path.exists("docs.zip") and not os.path.exists("docs"):
|
|
|
|
| 19 |
print("β
Extracted to /docs")
|
| 20 |
|
| 21 |
# -------------------------
|
| 22 |
+
# Step 2: Extract and log PDF text
|
| 23 |
# -------------------------
|
| 24 |
def load_docs(folder="docs"):
|
| 25 |
all_text = ""
|
| 26 |
found_files = []
|
|
|
|
| 27 |
for root, _, files in os.walk(folder):
|
| 28 |
for fname in files:
|
| 29 |
if fname.lower().endswith(".pdf"):
|
|
|
|
| 38 |
print(f"π Found {len(found_files)} PDF files:")
|
| 39 |
for f in found_files:
|
| 40 |
print(" -", f)
|
| 41 |
+
print(f"β
Total raw text size: {len(all_text)} characters")
|
| 42 |
+
print(f"π§Ύ Sample Text:\n{all_text[:300]}")
|
| 43 |
return all_text
|
| 44 |
|
| 45 |
# -------------------------
|
| 46 |
+
# Step 3: Chunk into paragraphs
|
| 47 |
# -------------------------
|
| 48 |
def chunk_text(text, max_words=200):
|
| 49 |
+
paragraphs = re.split(r'\n{2,}', text)
|
|
|
|
| 50 |
chunks, current = [], ""
|
| 51 |
+
for para in paragraphs:
|
| 52 |
if len((current + " " + para).split()) < max_words:
|
| 53 |
current += " " + para
|
| 54 |
else:
|
|
|
|
| 56 |
current = para
|
| 57 |
if current:
|
| 58 |
chunks.append(current.strip())
|
| 59 |
+
print(f"β
Total Chunks Created: {len(chunks)}")
|
| 60 |
+
for i, c in enumerate(chunks[:3]):
|
| 61 |
+
print(f"πΉ Chunk {i+1} Preview:\n{c[:250]}\n")
|
| 62 |
return [c for c in chunks if len(c.split()) > 20]
|
| 63 |
|
| 64 |
# -------------------------
|
| 65 |
+
# Step 4: Build FAISS Index
|
| 66 |
# -------------------------
|
| 67 |
def build_index():
    """Build an in-memory FAISS index over the PDF corpus.

    Side effects: extracts docs.zip if needed (via unzip_docs) and rebinds
    the module-level ``doc_chunks`` list so retrieval can map index hits
    back to their source text.

    Returns:
        faiss.IndexFlatIP: inner-product index over normalized embeddings
        (inner product of unit vectors == cosine similarity).

    Raises:
        RuntimeError: if no usable text chunks were extracted — otherwise
        this surfaces later as an opaque shape error on ``embeddings.shape[1]``.
    """
    unzip_docs()
    raw = load_docs("docs")

    global doc_chunks
    doc_chunks = chunk_text(raw)
    if not doc_chunks:
        raise RuntimeError(
            "No text chunks extracted from 'docs' - check that docs.zip "
            "contains readable PDF files."
        )

    # normalize_embeddings=True so IndexFlatIP scores are cosine similarities.
    embeddings = embedder.encode(doc_chunks, convert_to_numpy=True, normalize_embeddings=True)
    index = faiss.IndexFlatIP(embeddings.shape[1])
    index.add(embeddings)
    return index
|
| 76 |
|
| 77 |
# -------------------------
|
| 78 |
+
# Step 5: Generate Answer
|
| 79 |
# -------------------------
|
| 80 |
def generate_answer(question):
    """Answer *question* with retrieval-augmented generation.

    Embeds the question, retrieves the ``top_k`` most similar chunks from
    the module-level FAISS ``index``, and feeds them as context to FLAN-T5.

    Args:
        question (str): user question from the Gradio textbox.

    Returns:
        str: the generated answer text.
    """
    q_embed = embedder.encode([question], normalize_embeddings=True)
    D, I = index.search(np.array(q_embed), top_k)

    print("π Top similarity scores:", D[0])
    print("π§ Retrieved indices:", I[0])

    # FAISS pads I with -1 when fewer than top_k vectors exist; drop those
    # instead of letting doc_chunks[-1] silently inject the last chunk.
    retrieved = [doc_chunks[i] for i in I[0] if 0 <= i < len(doc_chunks)]
    # Number passages by retrieval rank (1..k), not by their corpus index.
    top_passages = [
        f"Passage {rank}:\n{chunk}"
        for rank, chunk in enumerate(retrieved, start=1)
    ]
    context = "\n\n".join(top_passages)

    prompt = (
        "You are SecurityGPT, a cybersecurity expert assistant. Use ONLY the context below to answer the question clearly in multiple paragraphs.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\nAnswer:"
    )

    # Truncate to the encoder limit; FLAN-T5's relative attention tolerates 1024.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
    output_ids = model.generate(
        input_ids,
        max_length=400,
        num_beams=4,
        repetition_penalty=1.2,
        early_stopping=True,
        # NOTE(review): the original also passed temperature=0.7, which is
        # ignored (with a warning) under beam search unless do_sample=True,
        # so dropping it does not change the generated output.
    )

    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return answer
|
| 108 |
|
| 109 |
# -------------------------
|
| 110 |
+
# Step 6: Load Model & Index
|
| 111 |
# -------------------------
|
| 112 |
# Embedding model for retrieval; tuned for cosine-similarity QA search.
embedder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

# Seq2seq model for answer generation; the small checkpoint keeps startup
# and CPU inference fast on Spaces hardware.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")

# Retrieval settings and shared state. build_index() repopulates doc_chunks
# as a side effect so generate_answer can map FAISS hits back to text.
top_k = 5
doc_chunks = []
index = build_index()
|
| 118 |
|
| 119 |
# -------------------------
|
| 120 |
+
# Step 7: Launch Gradio App
|
| 121 |
# -------------------------
|
| 122 |
# -------------------------
# Gradio UI wiring
# -------------------------
_EXAMPLE_QUESTIONS = [
    "How can I secure my home network?",
    "What are best practices for using public Wi-Fi?",
    "What should I know about password managers?",
]

demo = gr.Interface(
    fn=generate_answer,
    inputs=gr.Textbox(label="Ask SecurityGPT", placeholder="e.g. How do I protect against phishing?"),
    outputs=gr.Textbox(label="Answer"),
    title="π SecurityGPT",
    description="Ask cybersecurity questions based on embedded content from your PDF documents.",
    examples=_EXAMPLE_QUESTIONS,
)

demo.launch()
|