Aaseem committed on
Commit
06f7c7b
·
verified ·
1 Parent(s): 87554d6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +603 -0
  2. requirements.txt +17 -0
app.py ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ nltk.download('punkt_tab')
3
+ nltk.download('stopwords')
4
+
5
+
6
+ import os
7
+ import gradio as gr
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
9
+ from sentence_transformers import SentenceTransformer
10
+ from sklearn.cluster import DBSCAN
11
+ from sklearn.decomposition import PCA
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ import igraph as ig
14
+ import matplotlib.pyplot as plt
15
+ import pandas as pd
16
+ import numpy as np
17
+ import warnings
18
+ import nltk
19
+ import re
20
+ import spacy
21
+ from spacy import displacy
22
+ from bertopic import BERTopic
23
+ from datetime import datetime
24
+ import string
25
+ from nltk.corpus import stopwords
26
+ import pdfplumber
27
+ import os
28
+ import asyncio
29
+ from pyvis.network import Network
30
+ from langchain_core.documents import Document
31
+ from langchain_experimental.graph_transformers import LLMGraphTransformer
32
+ from langchain_openai import ChatOpenAI
33
+
34
# --- ENVIRONMENT / WARNING CONFIGURATION ---
# os.environ only accepts string values, so copying os.getenv(...) back into
# it raises TypeError when the variable is missing (and is a no-op when it is
# present). Check for the key instead and report its absence.
if not os.getenv("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; OpenAI-backed components cannot authenticate without it.")
os.environ["HF_HUB_DISABLE_XET_BACKEND"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
# Silence FutureWarning / UserWarning for the whole process.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
39
+
40
# --- GLOBAL MODEL/PIPELINE INITIALIZATION ---
# Everything below runs once at import time. Each optional model is bound to
# None before loading so that a failed load leaves a defined name behind
# instead of an undefined one.
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
graph_transformer = LLMGraphTransformer(llm=llm)
# Text of the most recently processed PDF; written by generate_llm_kg and
# read by answer_from_graph.
global_text_data = ""

# 1. NER Model
MODEL_NAME = "CyberPeace-Institute/SecureBERT-NER"
NER_MODEL_INITIALIZED = False
ner_tokenizer = None
ner_pipeline = None

try:
    print("Attempting to load SecureBERT-NER Model...")
    ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    ner_model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    ner_pipeline = pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple"
    )
    print("NER Model loaded successfully.")
    NER_MODEL_INITIALIZED = True
except Exception as e:
    print("CRITICAL ERROR: Failed to load NER model. Knowledge Graph functionality will be disabled.")
    print(f"Details: {e}")

# 2. Sentence Embedding Model for Clustering
embedding_model = None
try:
    print("Attempting to load Sentence Transformer Model...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Sentence Transformer Model loaded successfully.")
except Exception as e:
    print("CRITICAL ERROR: Failed to load Sentence Transformer model. Clustering functionality will be disabled.")
    print(f"Details: {e}")

# 3. NLTK Tokenizer for Sentence Splitting
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK 'punkt' model...")
    nltk.download('punkt')

# 4. spaCy Model for Linguistic Analysis
nlp = None
try:
    print("Attempting to load spaCy Model...")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy Model loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load spaCy model: {e}")

# 5. Sentiment Analysis Model
sentiment_pipeline = None
try:
    print("Attempting to load Sentiment Model...")
    sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name)
    print("Sentiment Pipeline loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load Sentiment pipeline: {e}")
100
+
101
+ # --- CORE UTILITY FUNCTIONS ---
102
def extract_pdf_text(pdf_path):
    """Extract the text of every page of a PDF into a single string.

    Each page that yields text has its non-breaking spaces replaced by
    regular spaces, is stripped, and is followed by a blank line.

    Args:
        pdf_path: Path of the PDF, passed to ``pdfplumber.open``.

    Returns:
        The concatenated page text, or a message starting with ``"Error"``
        when no text could be extracted or reading the file failed.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            raw_pages = [page.extract_text() for page in pdf.pages]
        # Pages without extractable text (None / empty string) are skipped.
        combined = "".join(
            raw.replace("\xa0", " ").strip() + "\n\n"
            for raw in raw_pages
            if raw
        )
        if combined.strip():
            return combined
        return "Error: No extractable text found in this PDF (it may be scanned or image-based)."
    except Exception as exc:
        return f"Error reading PDF file with pdfplumber: {type(exc).__name__}: {str(exc)}"
116
+
117
+
118
def chunk_text(text, max_length=512, overlap=50):
    """Split *text* into overlapping chunks of at most *max_length* NER tokens.

    Args:
        text: Text to split.
        max_length: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_length``.

    Returns:
        The decoded chunks, or ``["Model not loaded."]`` when the NER model
        failed to initialise.

    Raises:
        ValueError: If the window parameters would give a non-positive stride
            (which previously surfaced as an opaque ``range()`` error or a
            silently empty result) or a stride that skips tokens.
    """
    if max_length <= 0 or not 0 <= overlap < max_length:
        raise ValueError("chunk_text requires max_length > 0 and 0 <= overlap < max_length")
    if not NER_MODEL_INITIALIZED:
        return ["Model not loaded."]

    token_ids = ner_tokenizer.encode(text, add_special_tokens=False)
    stride = max_length - overlap
    return [
        ner_tokenizer.decode(token_ids[start:start + max_length])
        for start in range(0, len(token_ids), stride)
    ]
123
+
124
def clean_and_split_sentences(text):
    """Split *text* into sentences and keep only the prose-like ones.

    A sentence is kept when, after whitespace normalisation, it has between
    4 and 256 words, contains at least one run of three ASCII letters, and
    does not start with a caption/boilerplate prefix.

    Args:
        text: Raw document text.

    Returns:
        The list of whitespace-normalised sentences that passed the filters.
    """
    skipped_prefixes = ("figure ", "table ", "page ", "©", "appendix ")
    kept = []
    for raw in nltk.sent_tokenize(text):
        candidate = re.sub(r'\s+', ' ', raw).strip()
        n_words = len(candidate.split())
        has_real_word = re.search(r'[a-zA-Z]{3,}', candidate) is not None
        is_boilerplate = candidate.lower().startswith(skipped_prefixes)
        if 4 <= n_words <= 256 and has_real_word and not is_boilerplate:
            kept.append(candidate)
    return kept
140
+
141
def remove_punc_fast(text):
    """Return *text* with every ``string.punctuation`` character removed."""
    # Mapping a code point to None makes str.translate delete it.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletion_table)
144
+
145
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of on every call.
_STOPWORD_CACHE = {}


def remove_stopwords(text):
    """Return *text* with English stop words removed.

    Words are split on whitespace and compared case-sensitively against the
    NLTK English stop-word list. Removed words are dropped entirely, so the
    result contains single spaces between the remaining words (the previous
    implementation substituted empty strings, leaving doubled spaces).

    Args:
        text: Input text.

    Returns:
        The remaining words joined by single spaces.
    """
    english_stopwords = _STOPWORD_CACHE.get("english")
    if english_stopwords is None:
        # A frozenset gives O(1) membership tests instead of scanning a list.
        english_stopwords = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE["english"] = english_stopwords
    return " ".join(word for word in text.split() if word not in english_stopwords)
155
+
156
def clean_entity_names(entity_names):
    """Strip everything except ASCII letters and whitespace from each name.

    Args:
        entity_names: Iterable of entity name strings.

    Returns:
        The cleaned, stripped names in their original order; names that end
        up empty are omitted.
    """
    stripped = (re.sub(r'[^a-zA-Z\s]', '', name).strip() for name in entity_names)
    return [name for name in stripped if name]
164
+
165
def preprocess_text(text):
    """Lower-case *text*, then remove punctuation, then remove stop words."""
    lowered = text.lower()
    return remove_stopwords(remove_punc_fast(lowered))
170
+
171
async def extract_graph_data_async(text):
    """Convert *text* into graph documents with the global graph transformer.

    Args:
        text: Document text, wrapped into a single ``Document``.

    Returns:
        Whatever ``graph_transformer.aconvert_to_graph_documents`` returns
        for that one document.
    """
    source_document = Document(page_content=text)
    return await graph_transformer.aconvert_to_graph_documents([source_document])
175
+
176
def visualize_graph(graph_documents, output_file="llm_knowledge_graph.html"):
    """Write the first graph document to an interactive pyvis HTML file.

    Only nodes that appear as the source or target of a relationship whose
    two endpoints are both known nodes are drawn; nodes without such a
    relationship are left out. Graph documents after the first are ignored.

    Args:
        graph_documents: Sequence of graph documents exposing ``nodes`` and
            ``relationships``.
        output_file: Path of the HTML file to write.

    Returns:
        ``output_file``.
    """
    net = Network(height="800px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white", filter_menu=True, cdn_resources='remote')

    # Nothing to draw: still save an empty graph so the caller gets a file.
    if not graph_documents or not graph_documents[0].nodes:
        net.save_graph(output_file)
        return output_file

    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships
    node_dict = {node.id: node for node in nodes}

    # Keep only relationships whose endpoints both exist in the node list.
    valid_edges = [rel for rel in relationships if rel.source.id in node_dict and rel.target.id in node_dict]
    valid_node_ids = set([rel.source.id for rel in valid_edges] + [rel.target.id for rel in valid_edges])

    # The node type is used both as hover title and as the pyvis group.
    for node_id in valid_node_ids:
        node = node_dict[node_id]
        net.add_node(node.id, label=node.id, title=node.type, group=node.type)

    for rel in valid_edges:
        net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())

    # Physics/layout settings handed to pyvis as a JSON string.
    net.set_options("""
    {
    "physics": {
    "forceAtlas2Based": {
    "gravitationalConstant": -100,
    "centralGravity": 0.01,
    "springLength": 200,
    "springConstant": 0.08,
    "avoidOverlap": 0.5
    },
    "minVelocity": 0.75,
    "solver": "forceAtlas2Based"
    }
    }
    """)
    net.save_graph(output_file)
    return output_file
215
+
216
def generate_llm_kg(pdf_file):
    """Build the LLM knowledge graph for an uploaded PDF.

    On success the extracted text is also stored in ``global_text_data`` so
    that ``answer_from_graph`` can use it.

    Args:
        pdf_file: The uploaded file (an object exposing ``.name`` or a plain
            path string), or ``None`` when nothing was uploaded.

    Returns:
        ``(html_file_path, status_message)``; ``html_file_path`` is ``None``
        when no graph was produced.
    """
    global global_text_data

    if pdf_file is None:
        return None, "Please upload a PDF file."

    # Accept either an upload object with a ``.name`` path or a bare path.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    text = extract_pdf_text(pdf_path)
    # extract_pdf_text reports failures as a string starting with "Error".
    # Stop here so the error message is neither sent to the LLM as document
    # content nor stored as QnA context.
    if text.startswith("Error"):
        return None, text

    try:
        graph_documents = asyncio.run(extract_graph_data_async(text))
        # NOTE(review): the output name is fixed and global_text_data is
        # module-global, so concurrent sessions would overwrite each other.
        unique_file_name = "llm_knowledge_graph.html"
        html_file_path = visualize_graph(graph_documents, output_file=unique_file_name)
        global_text_data = text  # Store extracted text for QnA
        return html_file_path, "LLM Knowledge Graph generated successfully! You can now ask questions in the QnA tab."
    except Exception as e:
        return None, f"Error generating LLM knowledge graph: {e}"
231
+
232
def answer_from_graph(query):
    """Answer *query* with the LLM, using the stored PDF text as context.

    Args:
        query: The user's question.

    Returns:
        The stripped LLM answer, a prompt to generate a knowledge graph
        first when no text has been stored yet, or an error message if the
        LLM call raised.
    """
    global global_text_data
    if not global_text_data:
        return "Please generate a Knowledge Graph first by uploading a PDF."

    # The whole extracted text is embedded in the prompt.
    # NOTE(review): a long PDF may exceed the model's context window — confirm.
    prompt = f"""
    You are a helpful assistant. Use the following extracted content from a PDF to answer questions concisely.
    Content:
    {global_text_data}
    Question: {query}
    Answer:
    """
    try:
        response = llm.invoke(prompt)
        return response.content.strip()
    except Exception as e:
        return f"Error generating answer: {e}"
249
+
250
+
251
def batch_sentiment_analysis(sentences):
    """Run sentiment analysis on *sentences* in one batch and summarise it.

    The summary holds up to five highest-scoring POSITIVE and up to five
    highest-scoring NEGATIVE sentences, sorted by score in descending order.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``(summary_df, status_message)``. ``summary_df`` always has the
        columns ``Label``, ``Score`` and ``Sentence``; it is empty when there
        is nothing to analyse, the pipeline is not loaded, or the analysis
        raised.
    """
    columns = ["Label", "Score", "Sentence"]

    if not sentences:
        return pd.DataFrame(columns=columns), "No sentences to analyze."
    if sentiment_pipeline is None:
        # Same column layout as the empty-input case.
        return pd.DataFrame(columns=columns), "Sentiment pipeline not loaded."

    try:
        results = sentiment_pipeline(sentences, truncation=True)
        df = pd.DataFrame(results)
        valid_sentences = sentences[:len(df)]
        df['Sentence'] = valid_sentences
        df['Score'] = df['score'].round(3)
        df['Label'] = df['label']
        positive_df = df[df['Label'] == 'POSITIVE'].nlargest(5, 'Score')
        negative_df = df[df['Label'] == 'NEGATIVE'].nlargest(5, 'Score')
        summary_df = pd.concat([positive_df, negative_df]).sort_values('Score', ascending=False)
        return summary_df[columns], f"Analyzed {len(sentences)} sentences."
    except Exception as e:
        return pd.DataFrame(columns=columns), f"Error during sentiment analysis: {e}"
273
+
274
+
275
def batch_cti_classification(sentences):
    """Count how many sentences mention each CTI topic, by keyword.

    Matching is case-insensitive. Keywords are matched as substrings (so
    "exploit" also matches "exploits"), except short alphabetic acronyms
    such as "rce", which must be whole words. A plain substring test for
    "rce" also fires on "source", "resource" or "forced" and inflates the
    Exploit count.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``(summary_df, status_message)``. ``summary_df`` has the columns
        ``CTI Topic``, ``Mentions`` and ``Example Sentence``, one row per
        topic found, sorted by mentions in descending order. When no keyword
        matches, it is a single placeholder row.
    """
    if not sentences:
        return pd.DataFrame(columns=["CTI Topic", "Mentions", "Example Sentence"]), "No sentences to analyze."

    keywords = {
        "Phishing": ["phishing", "vishing", "smishing"],
        "Malware": ["malware", "ransomware", "trojan", "keylogger", "emotet"],
        "Vulnerability": ["cve-", "vulnerability", "zero-day"],
        "Attack": ["attack", "breach", "incident", "apt-", "ddos"],
        "Exploit": ["exploit", "exploited", "rce", "remote code execution"],
    }

    def _keyword_pattern(word):
        # Short acronyms get word boundaries; everything else stays a
        # substring match, as before.
        escaped = re.escape(word)
        if len(word) <= 3 and word.isalpha():
            return rf"\b{escaped}\b"
        return escaped

    # One compiled alternation per topic, built once instead of per sentence.
    topic_patterns = {
        topic: re.compile("|".join(_keyword_pattern(word) for word in words))
        for topic, words in keywords.items()
    }
    topic_summary = {topic: {"count": 0, "example": ""} for topic in keywords}

    for sentence in sentences:
        sentence_lower = sentence.lower()
        for topic, pattern in topic_patterns.items():
            # A topic is counted at most once per sentence.
            if pattern.search(sentence_lower):
                entry = topic_summary[topic]
                entry["count"] += 1
                if not entry["example"]:
                    entry["example"] = sentence

    summary_list = [
        {
            "CTI Topic": topic,
            "Mentions": data["count"],
            "Example Sentence": data["example"],
        }
        for topic, data in topic_summary.items()
        if data["count"] > 0
    ]

    if not summary_list:
        return pd.DataFrame([{"CTI Topic": "No CTI Keywords Found", "Mentions": 0, "Example Sentence": ""}]), "No CTI keywords found in document."

    # A stable sort keeps tied topics in their declaration order.
    summary_df = pd.DataFrame(summary_list).sort_values("Mentions", ascending=False, kind="stable")
    return summary_df, f"Scanned {len(sentences)} sentences for CTI terms."
314
+
315
+
316
+
317
def get_cluster_topic_names(sentences, cluster_assignments):
    """Derive a short display name for every cluster.

    Cluster ``-1`` is named "Outliers / Miscellaneous". Every other cluster
    is named after up to three features (unigrams or bigrams) selected by a
    ``TfidfVectorizer`` fitted on the cluster's sentences joined into a
    single document.

    Args:
        sentences: Sentences, aligned with *cluster_assignments*.
        cluster_assignments: Cluster id of each sentence.

    Returns:
        Dict mapping cluster id to its display name.
    """
    grouped = {label: [] for label in set(cluster_assignments)}
    for text, label in zip(sentences, cluster_assignments):
        grouped[label].append(text)

    names = {}
    for label, members in grouped.items():
        if label == -1:
            names[label] = "Outliers / Miscellaneous"
        else:
            try:
                tfidf = TfidfVectorizer(stop_words='english', max_features=3, ngram_range=(1, 2))
                tfidf.fit([" ".join(members)])
                names[label] = ", ".join(tfidf.get_feature_names_out())
            except ValueError:
                # Fitting failed for this cluster; use a generic name.
                names[label] = "Short / Common Phrases"
    return names
335
+
336
def perform_clustering(sentences):
    """Embed *sentences* and cluster the embeddings with DBSCAN.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``(embeddings, cluster_assignments, topic_names, status_message)``;
        the first three are ``None`` when *sentences* is empty.
    """
    if not sentences:
        return None, None, None, "No sentences to cluster."

    vectors = embedding_model.encode(sentences)
    labels = DBSCAN(eps=1.0, min_samples=2).fit(vectors).labels_
    names = get_cluster_topic_names(sentences, labels)
    return vectors, labels, names, f"Successfully clustered {len(sentences)} sentences."
345
+
346
def create_cluster_plot(embeddings, cluster_assignments, topic_names):
    """Plot the clusters on the first two PCA components of the embeddings.

    Args:
        embeddings: Sentence embeddings, or ``None``.
        cluster_assignments: Cluster id per embedding; compared element-wise
            against each id (assumes a NumPy array — confirm against caller).
        topic_names: Dict mapping cluster id to its legend label.

    Returns:
        The matplotlib figure, or ``None`` when *embeddings* is ``None``.
    """
    if embeddings is None:
        return None

    points_2d = PCA(n_components=2).fit_transform(embeddings)
    fig, ax = plt.subplots(figsize=(12, 10))

    cluster_ids = sorted(set(cluster_assignments))
    colour_positions = np.linspace(0, 1, len(cluster_ids))
    for idx, cluster_id in enumerate(cluster_ids):
        is_outlier = cluster_id == -1
        # Outliers are drawn in black with a smaller marker.
        colour = [0, 0, 0, 1] if is_outlier else plt.cm.viridis(colour_positions[idx])
        members = points_2d[cluster_assignments == cluster_id]
        ax.plot(
            members[:, 0],
            members[:, 1],
            'o',
            markerfacecolor=tuple(colour),
            markeredgecolor='k',
            markersize=7 if is_outlier else 14,
            label=topic_names.get(cluster_id, "Unknown"),
        )

    ax.set_title("Semantic Topic Clusters from PDF Document")
    ax.legend(title="Topics")
    return fig
364
+
365
def show_cluster_sentences(selected_topic, topics_dict, assignments_list, sentences_list):
    """List the sentences that belong to the cluster named *selected_topic*.

    Args:
        selected_topic: Topic name chosen by the user.
        topics_dict: Dict mapping cluster id to topic name.
        assignments_list: Cluster id per sentence.
        sentences_list: Sentences, aligned with *assignments_list*.

    Returns:
        ``(dataframe, status_message)``. The dataframe has one ``Sentences``
        column; it is empty when no topic is selected or the lookup failed.
    """
    if not selected_topic:
        return pd.DataFrame(columns=["Sentences"]), "Select a topic to see sample sentences."

    try:
        # If several clusters share the name, the first one wins.
        candidate_ids = [cid for cid, name in topics_dict.items() if name == selected_topic]
        target_id = candidate_ids[0]
        members = [
            sentence
            for sentence, assigned in zip(sentences_list, assignments_list)
            if assigned == target_id
        ]
        frame = pd.DataFrame(members, columns=["Sentences"])
        return frame, f"Showing {len(members)} sentences for topic: '{selected_topic}'"
    except Exception as e:
        return pd.DataFrame(), f"Error finding sentences: {e}"
379
+
380
def run_bertopic_modeling(sentences):
    """Fit a BERTopic model on *sentences* and summarise the topics.

    Args:
        sentences: List of sentence strings.

    Returns:
        ``(barchart_figure, topic_info, status_message)``; the first two are
        ``None`` when there are no sentences or the modelling raised.
    """
    if not sentences:
        return None, None, "No sentences to model. Please process a report first."

    try:
        print("Starting BERTopic modeling...")
        model = BERTopic(verbose=False, min_topic_size=6, embedding_model=embedding_model)
        # Only the fitted model is used below; the per-sentence results are not.
        _assigned_topics, _probabilities = model.fit_transform(sentences)

        info_table = model.get_topic_info()
        barchart = model.visualize_barchart(top_n_topics=10)

        print("BERTopic modeling complete.")
        return barchart, info_table, "BERTopic analysis complete."
    except Exception as e:
        return None, None, f"Error during BERTopic analysis: {e}"
400
+
401
+
402
+
403
+
404
def linguistic_analysis_spacy(text):
    """Return POS/dependency tags and a dependency-parse visualisation.

    Args:
        text: Text to analyse.

    Returns:
        ``(pos_tags, html)`` where ``pos_tags`` is a list of
        ``(token text, POS tag, dependency label)`` tuples and ``html`` wraps
        the displaCy SVG in a styled ``<div>``. For empty or whitespace-only
        input, an empty list and a placeholder paragraph are returned.
    """
    if not text or not text.strip():
        return [], "<p>Please enter text for analysis.</p>"

    doc = nlp(text)  # Uses the global nlp model
    pos_tags = [(t.text, t.pos_, t.dep_) for t in doc]

    # Render the dependency parse as raw SVG using the display options below.
    # NOTE(review): 'compact' is the string 'True' rather than a boolean, and
    # 'color' is '#fff' on a '#00a65a' background — confirm this is intended.
    options = {'distance': 110,'compact': 'True','color':'#fff','bg':'#00a65a',"font": "sans-serif"}
    svg = displacy.render(doc, style="dep", jupyter=False, options=options)


    # Wrap the SVG in a padded, horizontally scrollable container.
    html_wrapper = f"""
    <div style="background-color: white; border: 1px solid #E5E7EB; border-radius: 8px; padding: 12px; overflow-x: auto;">
    {svg}
    </div>
    """

    return pos_tags, html_wrapper
423
+
424
+
425
+
426
+ # --- GRADIO WORKFLOW FUNCTIONS ---
427
+
428
def unified_process_report(file_obj):
    """Process an uploaded PDF: extract sentences and build the knowledge graph.

    This handler feeds four outputs (processing status, sentence state,
    knowledge-graph file, knowledge-graph status), so every return path
    yields exactly four values.

    Args:
        file_obj: The uploaded file (an object exposing ``.name`` or a plain
            path string), or ``None`` when nothing was uploaded.

    Returns:
        ``(status, preprocessed_sentences, html_file_path, combined_status)``.
        On an early failure the sentence list is empty and the file path is
        ``None``.
    """
    kg_skipped = "Knowledge Graph not generated."

    if file_obj is None:
        return "Please upload a PDF file.", [], None, kg_skipped

    if not NER_MODEL_INITIALIZED:
        return "CRITICAL: NER Model failed to load.", [], None, kg_skipped

    # Accept either an upload object with a ``.name`` path or a bare path.
    pdf_path = getattr(file_obj, "name", file_obj)
    text = extract_pdf_text(pdf_path)
    # extract_pdf_text reports failures as a string starting with "Error".
    if text.startswith("Error"):
        return text, [], None, kg_skipped

    sentences = clean_and_split_sentences(text)

    # NOTE(review): the state holds lower-cased, punctuation- and
    # stop-word-stripped sentences. Hyphenated CTI keywords ("cve-",
    # "zero-day", "apt-") and negations such as "not" are removed before the
    # CTI and sentiment analysis see the text — confirm this is intended.
    preprocessed_sentences_for_state = [preprocess_text(s) for s in sentences]

    status = f"Processed {len(sentences)} clean sentences successfully."

    try:
        html_file_path, kg_status = generate_llm_kg(file_obj)
        # Include the processing status, as the failure branch below does.
        combined_status = f"{status}\n{kg_status}"
    except Exception as e:
        html_file_path, combined_status = None, f"{status}\nError generating Knowledge Graph: {e}"

    return status, preprocessed_sentences_for_state, html_file_path, combined_status
452
+
453
+
454
def run_clustering_workflow(sentences):
    """Cluster *sentences* and build every output of the clustering tab.

    Args:
        sentences: List of sentence strings held in the session state.

    Returns:
        ``(plot, status, labels, topics, dropdown, sentence_df)``. When there
        is nothing to cluster, the plot is ``None`` and the labels, topics,
        dropdown choices and sentence table are empty.
    """
    embeddings, labels, topics, status = perform_clustering(sentences)

    # perform_clustering returns None placeholders for empty input; calling
    # topics.values() on them would raise AttributeError.
    if embeddings is None or topics is None:
        empty_df = pd.DataFrame(columns=["Sentences"])
        return None, status, [], {}, gr.Dropdown(choices=[]), empty_df

    plot = create_cluster_plot(embeddings, labels, topics)
    topic_name_list = list(topics.values())
    sentence_df = pd.DataFrame(sentences, columns=["Sentences"])
    return plot, status, labels, topics, gr.Dropdown(choices=topic_name_list), sentence_df
460
+
461
def run_batch_analysis(sentences):
    """Run the CTI keyword scan and the sentiment analysis on *sentences*.

    Returns:
        ``(cti_dataframe, sentiment_dataframe, combined_status_message)``.
    """
    cti_table, cti_message = batch_cti_classification(sentences)
    sentiment_table, sentiment_message = batch_sentiment_analysis(sentences)
    return cti_table, sentiment_table, f"CTI: {cti_message} | Sentiment: {sentiment_message}"
466
+
467
def on_click(pdf_file):
    """Generate the LLM knowledge graph for *pdf_file*.

    Returns:
        ``(html_file_path, status_message)`` as produced by
        ``generate_llm_kg``.
    """
    graph_path, message = generate_llm_kg(pdf_file)
    return (graph_path, message)
470
+
471
+ # --- GRADIO INTERFACE LAYOUT ---
472
+
473
+ with gr.Blocks(title="CTI Analysis Tool", theme=gr.themes.Soft()) as app:
474
+ gr.Markdown("# Cyber Threat Intelligence (CTI) Analysis Tool")
475
+ gr.Markdown("Upload a CTI report (PDF) to analyze entities and semantic topics.")
476
+
477
+ # --- State Variables ---
478
+ sentences_state = gr.State([])
479
+ cluster_assignments_state = gr.State([])
480
+ cluster_topics_state = gr.State({})
481
+
482
+ # --- Main Upload Row ---
483
+ with gr.Row():
484
+ file_input = gr.File(label="Upload CTI Report (PDF)", file_types=[".pdf"])
485
+ process_button = gr.Button("Process Report", variant="primary")
486
+ status_output = gr.Textbox(label="Processing Status", interactive=False)
487
+
488
+ # --- Tabs ---
489
+ with gr.Tabs():
490
+ with gr.TabItem("Knowledge Graph Analyzer"):
491
+ gr.Markdown("### Knowledge Graph")
492
+ llm_status = gr.Textbox(label="Status", interactive=False)
493
+ llm_graph_output_file = gr.File(label="Knowledge Graph HTML File", file_types=[".html"], interactive=False)
494
+
495
+
496
+ with gr.TabItem("Knowledge Graph QnA"):
497
+ gr.Markdown("### Ask Questions About the Knowledge Graph")
498
+ user_query = gr.Textbox(label="Enter your question", placeholder="e.g., Which malware communicates with example.com?")
499
+ ask_button = gr.Button("Get Answer")
500
+ answer_box = gr.Textbox(label="Answer", lines=5, interactive=False)
501
+
502
+
503
+ with gr.TabItem("Semantic Topic Clustering"):
504
+ gr.Markdown("### Group Sentences by Semantic Meaning (DBSCAN)")
505
+ cluster_button = gr.Button("1. Cluster PDF Sentences", variant="secondary")
506
+ cluster_status = gr.Textbox(label="Clustering Status", interactive=False)
507
+ gr.Markdown("#### Sentences Used for Clustering")
508
+ input_sentence_df = gr.DataFrame(headers=["Sentences"], label="Input Sentences", interactive=False, row_count=10)
509
+ gr.Markdown("#### Cluster Visualization")
510
+ cluster_plot_output = gr.Plot(label="Sentence Cluster Visualization")
511
+ gr.Markdown("### Explore Clusters")
512
+ with gr.Row():
513
+ topic_dropdown = gr.Dropdown(label="Select Topic", choices=[], interactive=True, scale=3)
514
+ cluster_sentence_df = gr.DataFrame(headers=["Sentences"], label="Sentences in Selected Cluster", interactive=False, scale=4, row_count=10)
515
+
516
+ with gr.TabItem("Document Summary"):
517
+ gr.Markdown("### Sentiment & CTI Summary")
518
+ analyze_pdf_button = gr.Button("Analyze PDF Sentences", variant="primary")
519
+ summary_status = gr.Textbox(label="Analysis Status", interactive=False)
520
+ gr.Markdown("#### CTI Keyword Summary")
521
+ cti_summary_output = gr.DataFrame(headers=["CTI Topic", "Mentions", "Example Sentence"], label="CTI Summary")
522
+ gr.Markdown("#### Sentiment Analysis")
523
+ sentiment_summary_output = gr.DataFrame(headers=["Label", "Score", "Sentence"], label="Sentiment Highlights", row_count=10)
524
+
525
+ # --- NEW: BERTopic Tab ---
526
+ with gr.TabItem("Topic Modeling (BERTopic)"):
527
+ gr.Markdown("### Advanced Topic Modeling with BERTopic")
528
+ gr.Markdown("Run BERTopic on the full list of cleaned sentences to discover themes.")
529
+ bertopic_button = gr.Button("Run Topic Model", variant="secondary")
530
+ bertopic_status = gr.Textbox(label="BERTopic Status", interactive=False)
531
+ gr.Markdown("#### Top 10 Discovered Topics")
532
+ bertopic_plot = gr.Plot(label="BERTopic Barchart")
533
+ gr.Markdown("#### All Discovered Topics")
534
+ bertopic_df = gr.DataFrame(label="BERTopic Topic List")
535
+
536
+ # --- NEW: Linguistic Analysis Tab ---
537
+ with gr.TabItem("Linguistic Analysis (spaCy)"):
538
+ gr.Markdown("### POS Tagging & Dependency Parsing")
539
+ gr.Markdown("Analyze the grammatical structure of a single sentence.")
540
+ ling_input = gr.Textbox(label="Enter a sentence to analyze", lines=3, placeholder="e.g., Copy a sentence from the cluster results...")
541
+ ling_button = gr.Button("Analyze Syntax")
542
+ gr.Markdown("#### Part-of-Speech (POS) Tags")
543
+ ling_pos_df = gr.DataFrame(headers=["Token", "POS", "Dependency"], label="POS Tags", row_count=10)
544
+ gr.Markdown("#### Dependency Plot")
545
+ ling_dep_html = gr.HTML(label="Dependency Visualization")
546
+
547
+ # --- EVENT HANDLERS ---
548
+ process_button.click(
549
+ fn=unified_process_report,
550
+ inputs=[file_input],
551
+ outputs=[status_output, sentences_state, llm_graph_output_file, llm_status]
552
+ )
553
+
554
+ cluster_button.click(
555
+ fn=run_clustering_workflow,
556
+ inputs=[sentences_state],
557
+ outputs=[
558
+ cluster_plot_output,
559
+ cluster_status,
560
+ cluster_assignments_state,
561
+ cluster_topics_state,
562
+ topic_dropdown,
563
+ input_sentence_df
564
+ ]
565
+ )
566
+
567
+ topic_dropdown.select(
568
+ fn=show_cluster_sentences,
569
+ inputs=[
570
+ topic_dropdown,
571
+ cluster_topics_state,
572
+ cluster_assignments_state,
573
+ sentences_state
574
+ ],
575
+ outputs=[cluster_sentence_df, cluster_status]
576
+ )
577
+
578
+ analyze_pdf_button.click(
579
+ fn=run_batch_analysis,
580
+ inputs=sentences_state,
581
+ outputs=[cti_summary_output, sentiment_summary_output, summary_status]
582
+ )
583
+
584
+ bertopic_button.click(
585
+ fn=run_bertopic_modeling,
586
+ inputs=[sentences_state],
587
+ outputs=[bertopic_plot, bertopic_df, bertopic_status]
588
+ )
589
+
590
+ ling_button.click(
591
+ fn=linguistic_analysis_spacy,
592
+ inputs=[ling_input],
593
+ outputs=[ling_pos_df, ling_dep_html]
594
+ )
595
+
596
+ ask_button.click(
597
+ fn=answer_from_graph,
598
+ inputs=[user_query],
599
+ outputs=[answer_box]
600
+ )
601
+
602
+
603
+ app.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradio
transformers
sentence-transformers
scikit-learn
pandas
numpy
matplotlib
nltk
spacy
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
pdfplumber
pyvis
langchain
langchain-openai
langchain-experimental
bertopic
protobuf
# app.py runs `import igraph as ig` at module level, so it must be installed.
igraph