import os
import re
import string
import asyncio
import warnings

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdfplumber
import nltk
from nltk.corpus import stopwords
import spacy
from spacy import displacy
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from pyvis.network import Network
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# ChatOpenAI reads OPENAI_API_KEY from the environment directly; warn early
# instead of re-assigning it (os.environ[...] = None would raise a TypeError).
if not os.getenv("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set. LLM features will be unavailable.")
os.environ["HF_HUB_DISABLE_XET_BACKEND"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# --- GLOBAL MODEL/PIPELINE INITIALIZATION ---

llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
graph_transformer = LLMGraphTransformer(llm=llm)

# Extracted PDF text, shared between the Knowledge Graph and QnA tabs.
global_text_data = ""

# 1. NER Model
MODEL_NAME = "CyberPeace-Institute/SecureBERT-NER"
NER_MODEL_INITIALIZED = False
ner_tokenizer = None
ner_pipeline = None
try:
    print("Attempting to load SecureBERT-NER Model...")
    ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    ner_model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    ner_pipeline = pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple"
    )
    print("NER Model loaded successfully.")
    NER_MODEL_INITIALIZED = True
except Exception as e:
    print("CRITICAL ERROR: Failed to load NER model. Knowledge Graph functionality will be disabled.")
    print(f"Details: {e}")
# 2. Sentence Embedding Model for Clustering
embedding_model = None
try:
    print("Attempting to load Sentence Transformer Model...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Sentence Transformer Model loaded successfully.")
except Exception as e:
    print("CRITICAL ERROR: Failed to load Sentence Transformer model. Clustering functionality will be disabled.")
    print(f"Details: {e}")
# 3. NLTK Resources for Sentence Splitting and Stopword Removal
for resource, path in [("punkt", "tokenizers/punkt"),
                       ("punkt_tab", "tokenizers/punkt_tab"),
                       ("stopwords", "corpora/stopwords")]:
    try:
        nltk.data.find(path)
    except LookupError:
        print(f"Downloading NLTK '{resource}' resource...")
        nltk.download(resource, quiet=True)
# 4. spaCy Model for Linguistic Analysis
nlp = None
try:
    print("Attempting to load spaCy Model...")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy Model loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load spaCy model: {e}")
# 5. Sentiment Analysis Model
sentiment_pipeline = None
try:
    print("Attempting to load Sentiment Model...")
    sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name)
    print("Sentiment Pipeline loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load Sentiment pipeline: {e}")
# --- CORE UTILITY FUNCTIONS ---

def extract_pdf_text(pdf_path):
    """Extract plain text from every page of a PDF, or return an error string."""
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_text = page_text.replace("\xa0", " ").strip()
                    text += page_text + "\n\n"
        if not text.strip():
            return "Error: No extractable text found in this PDF (it may be scanned or image-based)."
        return text
    except Exception as e:
        return f"Error reading PDF file with pdfplumber: {type(e).__name__}: {str(e)}"
def chunk_text(text, max_length=512, overlap=50):
    if not NER_MODEL_INITIALIZED:
        return ["Model not loaded."]
    tokens = ner_tokenizer.encode(text, add_special_tokens=False)
    chunks = [ner_tokenizer.decode(tokens[i:i + max_length])
              for i in range(0, len(tokens), max_length - overlap)]
    return chunks
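# clean_and_split_sentences tokenizes the document into sentences and drops
# fragments that are too short or too long, contain no real words, or look
# like figure/table/page boilerplate.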
def clean_and_split_sentences(text):
    sentences = nltk.sent_tokenize(text)
    clean_sentences = []
    for sentence in sentences:
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        word_count = len(sentence.split())
        if word_count < 4 or word_count > 256:
            continue
        if not re.search(r'[a-zA-Z]{3,}', sentence):
            continue
        if sentence.lower().startswith(("figure ", "table ", "page ", "©", "appendix ")):
            continue
        clean_sentences.append(sentence)
    return clean_sentences
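# Lightweight normalization helpers used by preprocess_text: strip punctuation
# and remove English stopwords before embedding/clustering.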
def remove_punc_fast(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_stopwords(text):
    english_stopwords = set(stopwords.words('english'))
    # Drop stopwords entirely rather than replacing them with empty strings,
    # which would leave doubled spaces after the join.
    return " ".join(word for word in text.split() if word not in english_stopwords)
def clean_entity_names(entity_names):
    cleaned_words = []
    for word in entity_names:
        cleaned = re.sub(r'[^a-zA-Z\s]', '', word).strip()
        if cleaned:
            cleaned_words.append(cleaned)
    return cleaned_words

def preprocess_text(text):
    text = text.lower()
    text = remove_punc_fast(text)
    text = remove_stopwords(text)
    return text
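# LLMGraphTransformer prompts the LLM to emit node/relationship structures from
# free text; aconvert_to_graph_documents is async, so callers wrap it in asyncio.run.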
async def extract_graph_data_async(text):
    documents = [Document(page_content=text)]
    graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)
    return graph_documents
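# Render the extracted graph with PyVis. Only edges whose endpoints both exist
# are drawn, isolated nodes are skipped, and the forceAtlas2Based physics
# settings keep larger graphs readable.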
def visualize_graph(graph_documents, output_file="llm_knowledge_graph.html"):
    net = Network(height="800px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white",
                  filter_menu=True, cdn_resources='remote')
    if not graph_documents or not graph_documents[0].nodes:
        net.save_graph(output_file)
        return output_file
    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships
    node_dict = {node.id: node for node in nodes}
    valid_edges = [rel for rel in relationships
                   if rel.source.id in node_dict and rel.target.id in node_dict]
    valid_node_ids = set([rel.source.id for rel in valid_edges] +
                         [rel.target.id for rel in valid_edges])
    for node_id in valid_node_ids:
        node = node_dict[node_id]
        net.add_node(node.id, label=node.id, title=node.type, group=node.type)
    for rel in valid_edges:
        net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())
    net.set_options("""
    {
      "physics": {
        "forceAtlas2Based": {
          "gravitationalConstant": -100,
          "centralGravity": 0.01,
          "springLength": 200,
          "springConstant": 0.08,
          "avoidOverlap": 0.5
        },
        "minVelocity": 0.75,
        "solver": "forceAtlas2Based"
      }
    }
    """)
    net.save_graph(output_file)
    return output_file
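# End-to-end KG step: extract the PDF text, build graph documents via the LLM,
# write the interactive HTML file, and cache the raw text for the QnA tab.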
def generate_llm_kg(pdf_file):
    global global_text_data
    if pdf_file is None:
        return None, "Please upload a PDF file."
    text = extract_pdf_text(pdf_file.name)
    if text.startswith("Error"):
        return None, text
    try:
        graph_documents = asyncio.run(extract_graph_data_async(text))
        html_file_path = visualize_graph(graph_documents, output_file="llm_knowledge_graph.html")
        global_text_data = text  # Store extracted text for QnA
        return html_file_path, "LLM Knowledge Graph generated successfully! You can now ask questions in the QnA tab."
    except Exception as e:
        return None, f"Error generating LLM knowledge graph: {e}"
def answer_from_graph(query):
    if not global_text_data:
        return "Please generate a Knowledge Graph first by uploading a PDF."
    prompt = f"""
You are a helpful assistant. Use the following extracted content from a PDF to answer questions concisely.

Content:
{global_text_data}

Question: {query}

Answer:
"""
    try:
        response = llm.invoke(prompt)
        return response.content.strip()
    except Exception as e:
        return f"Error generating answer: {e}"
def batch_sentiment_analysis(sentences):
    """Analyzes a list of sentences in a fast batch."""
    if not sentences:
        return pd.DataFrame(columns=["Label", "Score", "Sentence"]), "No sentences to analyze."
    if sentiment_pipeline is None:
        return pd.DataFrame(), "Sentiment pipeline not loaded."
    try:
        results = sentiment_pipeline(sentences, truncation=True)
        df = pd.DataFrame(results)
        df['Sentence'] = sentences[:len(df)]
        df['Score'] = df['score'].round(3)
        df['Label'] = df['label']
        positive_df = df[df['Label'] == 'POSITIVE'].nlargest(5, 'Score')
        negative_df = df[df['Label'] == 'NEGATIVE'].nlargest(5, 'Score')
        summary_df = pd.concat([positive_df, negative_df]).sort_values('Score', ascending=False)
        return summary_df[['Label', 'Score', 'Sentence']], f"Analyzed {len(sentences)} sentences."
    except Exception as e:
        return pd.DataFrame(), f"Error during sentiment analysis: {e}"
def batch_cti_classification(sentences):
    if not sentences:
        return pd.DataFrame(columns=["CTI Topic", "Mentions", "Example Sentence"]), "No sentences to analyze."
    keywords = {
        "Phishing": ["phishing", "vishing", "smishing"],
        "Malware": ["malware", "ransomware", "trojan", "keylogger", "emotet"],
        "Vulnerability": ["cve-", "vulnerability", "zero-day"],
        "Attack": ["attack", "breach", "incident", "apt-", "ddos"],
        "Exploit": ["exploit", "exploited", "rce", "remote code execution"],
    }
    topic_summary = {topic: {"count": 0, "example": ""} for topic in keywords}
    for sentence in sentences:
        sentence_lower = sentence.lower()
        for topic, words in keywords.items():
            if any(word in sentence_lower for word in words):
                topic_summary[topic]["count"] += 1
                if not topic_summary[topic]["example"]:
                    topic_summary[topic]["example"] = sentence
    summary_list = [
        {"CTI Topic": topic, "Mentions": data["count"], "Example Sentence": data["example"]}
        for topic, data in topic_summary.items() if data["count"] > 0
    ]
    if not summary_list:
        return pd.DataFrame([{"CTI Topic": "No CTI Keywords Found", "Mentions": 0, "Example Sentence": ""}]), "No CTI keywords found in document."
    summary_df = pd.DataFrame(summary_list).sort_values("Mentions", ascending=False)
    return summary_df, f"Scanned {len(sentences)} sentences for CTI terms."
def get_cluster_topic_names(sentences, cluster_assignments):
    clustered_sentences = {i: [] for i in set(cluster_assignments)}
    for sentence, cluster_id in zip(sentences, cluster_assignments):
        clustered_sentences[cluster_id].append(sentence)
    topic_names = {}
    for cluster_id, docs in clustered_sentences.items():
        if cluster_id == -1:
            topic_names[cluster_id] = "Outliers / Miscellaneous"
            continue
        try:
            vectorizer = TfidfVectorizer(stop_words='english', max_features=3, ngram_range=(1, 2))
            vectorizer.fit([" ".join(docs)])
            topic_names[cluster_id] = ", ".join(vectorizer.get_feature_names_out())
        except ValueError:
            topic_names[cluster_id] = "Short / Common Phrases"
    return topic_names
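# Cluster sentence embeddings with DBSCAN; eps=1.0 and min_samples=2 are
# heuristic choices here, and sentences that fit no cluster receive label -1.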
def perform_clustering(sentences):
    if not sentences:
        return None, None, None, "No sentences to cluster."
    if embedding_model is None:
        return None, None, None, "Embedding model not loaded; clustering is disabled."
    embeddings = embedding_model.encode(sentences)
    dbscan = DBSCAN(eps=1.0, min_samples=2)
    dbscan.fit(embeddings)
    cluster_assignments = dbscan.labels_
    topic_names = get_cluster_topic_names(sentences, cluster_assignments)
    return embeddings, cluster_assignments, topic_names, f"Successfully clustered {len(sentences)} sentences."
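# Project embeddings to 2-D with PCA purely for plotting; the clustering itself
# ran in the full embedding space. Outliers (label -1) are drawn in black with
# smaller markers.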
def create_cluster_plot(embeddings, cluster_assignments, topic_names):
    if embeddings is None:
        return None
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embeddings)
    fig, ax = plt.subplots(figsize=(12, 10))
    unique_labels = sorted(set(cluster_assignments))
    colors = [plt.cm.viridis(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        label = topic_names.get(k, "Unknown")
        if k == -1:
            col = [0, 0, 0, 1]
        class_member_mask = (cluster_assignments == k)
        xy = reduced_embeddings[class_member_mask]
        ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=14 if k != -1 else 7, label=label)
    ax.set_title("Semantic Topic Clusters from PDF Document")
    ax.legend(title="Topics")
    return fig
def show_cluster_sentences(selected_topic, topics_dict, assignments_list, sentences_list):
    if not selected_topic:
        return pd.DataFrame(columns=["Sentences"]), "Select a topic to see sample sentences."
    try:
        cluster_id = [key for key, value in topics_dict.items() if value == selected_topic][0]
        matching_sentences = [sentence for sentence, assignment
                              in zip(sentences_list, assignments_list)
                              if assignment == cluster_id]
        df = pd.DataFrame(matching_sentences, columns=["Sentences"])
        status = f"Showing {len(matching_sentences)} sentences for topic: '{selected_topic}'"
        return df, status
    except Exception as e:
        return pd.DataFrame(), f"Error finding sentences: {e}"
def run_bertopic_modeling(sentences):
    if not sentences:
        return None, None, "No sentences to model. Please process a report first."
    try:
        print("Starting BERTopic modeling...")
        topic_model = BERTopic(verbose=False, min_topic_size=6, embedding_model=embedding_model)
        topics, probs = topic_model.fit_transform(sentences)
        topic_info = topic_model.get_topic_info()  # Summary table of all discovered topics
        fig = topic_model.visualize_barchart(top_n_topics=10)
        print("BERTopic modeling complete.")
        return fig, topic_info, "BERTopic analysis complete."
    except Exception as e:
        return None, None, f"Error during BERTopic analysis: {e}"
def linguistic_analysis_spacy(text):
    if not text or not text.strip():
        return [], "<p>Please enter text for analysis.</p>"
    if nlp is None:
        return [], "<p>spaCy model is not loaded.</p>"
    doc = nlp(text)  # Uses the global spaCy model
    pos_tags = [(t.text, t.pos_, t.dep_) for t in doc]
    # Render the dependency parse as SVG: white text on a green background,
    # wrapped in a white card so it stands out in the Gradio page.
    options = {'distance': 110, 'compact': True, 'color': '#fff',
               'bg': '#00a65a', 'font': 'sans-serif'}
    svg = displacy.render(doc, style="dep", jupyter=False, options=options)
    html_wrapper = f"""
    <div style="background-color: white; border: 1px solid #E5E7EB; border-radius: 8px; padding: 12px; overflow-x: auto;">
        {svg}
    </div>
    """
    return pos_tags, html_wrapper
# --- GRADIO WORKFLOW FUNCTIONS ---

def unified_process_report(file_obj):
    # Must always return four values to match the Gradio outputs:
    # (status, sentences for state, KG HTML path, KG status).
    if file_obj is None:
        return "Please upload a PDF file.", [], None, ""
    if not NER_MODEL_INITIALIZED:
        return "CRITICAL: NER Model failed to load.", [], None, ""
    text = extract_pdf_text(file_obj.name)
    if text.startswith("Error"):
        return text, [], None, ""
    sentences = clean_and_split_sentences(text)
    preprocessed_sentences_for_state = [preprocess_text(s) for s in sentences]
    status = f"Processed {len(sentences)} clean sentences successfully."
    try:
        html_file_path, kg_status = generate_llm_kg(file_obj)
        combined_status = f"{status}\n{kg_status}"
    except Exception as e:
        html_file_path, combined_status = None, f"{status}\nError generating Knowledge Graph: {e}"
    return status, preprocessed_sentences_for_state, html_file_path, combined_status
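# Fan the clustering results out to the plot, state variables, dropdown
# choices, and the sentence table in one callback.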
def run_clustering_workflow(sentences):
    embeddings, labels, topics, status = perform_clustering(sentences)
    if topics is None:
        return None, status, [], {}, gr.Dropdown(choices=[]), pd.DataFrame(columns=["Sentences"])
    plot = create_cluster_plot(embeddings, labels, topics)
    topic_name_list = list(topics.values())
    sentence_df = pd.DataFrame(sentences, columns=["Sentences"])
    return plot, status, labels, topics, gr.Dropdown(choices=topic_name_list), sentence_df
def run_batch_analysis(sentences):
    cti_df, cti_status = batch_cti_classification(sentences)
    sent_df, sent_status = batch_sentiment_analysis(sentences)
    full_status = f"CTI: {cti_status} | Sentiment: {sent_status}"
    return cti_df, sent_df, full_status

def on_click(pdf_file):
    html_file_path, status = generate_llm_kg(pdf_file)
    return html_file_path, status
# --- GRADIO INTERFACE LAYOUT ---

with gr.Blocks(title="CTI Analysis Tool", theme=gr.themes.Soft()) as app:
    gr.Markdown("# Cyber Threat Intelligence (CTI) Analysis Tool")
    gr.Markdown("Upload a CTI report (PDF) to analyze entities and semantic topics.")

    # --- State Variables ---
    sentences_state = gr.State([])
    cluster_assignments_state = gr.State([])
    cluster_topics_state = gr.State({})

    # --- Main Upload Row ---
    with gr.Row():
        file_input = gr.File(label="Upload CTI Report (PDF)", file_types=[".pdf"])
        process_button = gr.Button("Process Report", variant="primary")
    status_output = gr.Textbox(label="Processing Status", interactive=False)

    # --- Tabs ---
    with gr.Tabs():
        with gr.TabItem("Knowledge Graph Analyzer"):
            gr.Markdown("### Knowledge Graph")
            llm_status = gr.Textbox(label="Status", interactive=False)
            llm_graph_output_file = gr.File(label="Knowledge Graph HTML File", file_types=[".html"], interactive=False)

        with gr.TabItem("Knowledge Graph QnA"):
            gr.Markdown("### Ask Questions About the Knowledge Graph")
            user_query = gr.Textbox(label="Enter your question", placeholder="e.g., Which malware communicates with example.com?")
            ask_button = gr.Button("Get Answer")
            answer_box = gr.Textbox(label="Answer", lines=5, interactive=False)

        with gr.TabItem("Semantic Topic Clustering"):
            gr.Markdown("### Group Sentences by Semantic Meaning (DBSCAN)")
            cluster_button = gr.Button("1. Cluster PDF Sentences", variant="secondary")
            cluster_status = gr.Textbox(label="Clustering Status", interactive=False)
            gr.Markdown("#### Sentences Used for Clustering")
            input_sentence_df = gr.DataFrame(headers=["Sentences"], label="Input Sentences", interactive=False, row_count=10)
            gr.Markdown("#### Cluster Visualization")
            cluster_plot_output = gr.Plot(label="Sentence Cluster Visualization")
            gr.Markdown("### Explore Clusters")
            with gr.Row():
                topic_dropdown = gr.Dropdown(label="Select Topic", choices=[], interactive=True, scale=3)
                cluster_sentence_df = gr.DataFrame(headers=["Sentences"], label="Sentences in Selected Cluster", interactive=False, scale=4, row_count=10)

        with gr.TabItem("Document Summary"):
            gr.Markdown("### Sentiment & CTI Summary")
            analyze_pdf_button = gr.Button("Analyze PDF Sentences", variant="primary")
            summary_status = gr.Textbox(label="Analysis Status", interactive=False)
            gr.Markdown("#### CTI Keyword Summary")
            cti_summary_output = gr.DataFrame(headers=["CTI Topic", "Mentions", "Example Sentence"], label="CTI Summary")
            gr.Markdown("#### Sentiment Analysis")
            sentiment_summary_output = gr.DataFrame(headers=["Label", "Score", "Sentence"], label="Sentiment Highlights", row_count=10)

        # --- NEW: BERTopic Tab ---
        with gr.TabItem("Topic Modeling (BERTopic)"):
            gr.Markdown("### Advanced Topic Modeling with BERTopic")
            gr.Markdown("Run BERTopic on the full list of cleaned sentences to discover themes.")
            bertopic_button = gr.Button("Run Topic Model", variant="secondary")
            bertopic_status = gr.Textbox(label="BERTopic Status", interactive=False)
            gr.Markdown("#### Top 10 Discovered Topics")
            bertopic_plot = gr.Plot(label="BERTopic Barchart")
            gr.Markdown("#### All Discovered Topics")
            bertopic_df = gr.DataFrame(label="BERTopic Topic List")

        # --- NEW: Linguistic Analysis Tab ---
        with gr.TabItem("Linguistic Analysis (spaCy)"):
            gr.Markdown("### POS Tagging & Dependency Parsing")
            gr.Markdown("Analyze the grammatical structure of a single sentence.")
            ling_input = gr.Textbox(label="Enter a sentence to analyze", lines=3, placeholder="e.g., Copy a sentence from the cluster results...")
            ling_button = gr.Button("Analyze Syntax")
            gr.Markdown("#### Part-of-Speech (POS) Tags")
            ling_pos_df = gr.DataFrame(headers=["Token", "POS", "Dependency"], label="POS Tags", row_count=10)
            gr.Markdown("#### Dependency Plot")
            ling_dep_html = gr.HTML(label="Dependency Visualization")
    # --- EVENT HANDLERS ---
    process_button.click(
        fn=unified_process_report,
        inputs=[file_input],
        outputs=[status_output, sentences_state, llm_graph_output_file, llm_status]
    )
    cluster_button.click(
        fn=run_clustering_workflow,
        inputs=[sentences_state],
        outputs=[
            cluster_plot_output,
            cluster_status,
            cluster_assignments_state,
            cluster_topics_state,
            topic_dropdown,
            input_sentence_df
        ]
    )
    topic_dropdown.select(
        fn=show_cluster_sentences,
        inputs=[
            topic_dropdown,
            cluster_topics_state,
            cluster_assignments_state,
            sentences_state
        ],
        outputs=[cluster_sentence_df, cluster_status]
    )
    analyze_pdf_button.click(
        fn=run_batch_analysis,
        inputs=sentences_state,
        outputs=[cti_summary_output, sentiment_summary_output, summary_status]
    )
    bertopic_button.click(
        fn=run_bertopic_modeling,
        inputs=[sentences_state],
        outputs=[bertopic_plot, bertopic_df, bertopic_status]
    )
    ling_button.click(
        fn=linguistic_analysis_spacy,
        inputs=[ling_input],
        outputs=[ling_pos_df, ling_dep_html]
    )
    ask_button.click(
        fn=answer_from_graph,
        inputs=[user_query],
        outputs=[answer_box]
    )

app.launch(debug=True)