Upload 2 files
- app.py +603 -0
- requirements.txt +17 -0
app.py
ADDED
@@ -0,0 +1,603 @@
import nltk

nltk.download('punkt_tab')
nltk.download('stopwords')

import os
import asyncio
import re
import string
import warnings

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pdfplumber
import spacy
from bertopic import BERTopic
from nltk.corpus import stopwords
from pyvis.network import Network
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy import displacy
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI

# OPENAI_API_KEY is expected as an environment variable (e.g. a Space secret).
# Re-assigning os.getenv(...) back unguarded would raise TypeError when it is
# unset, so only warn here and let the LLM features fail lazily.
if not os.getenv("OPENAI_API_KEY"):
    warnings.warn("OPENAI_API_KEY is not set; Knowledge Graph and QnA features will fail.")
os.environ["HF_HUB_DISABLE_XET_BACKEND"] = "1"
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

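# The heavy models below are loaded eagerly at import time, so startup can be
# slow; each load is wrapped in try/except so the UI still launches if one fails.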
# --- GLOBAL MODEL/PIPELINE INITIALIZATION ---
llm = ChatOpenAI(temperature=0, model_name="gpt-4o")
graph_transformer = LLMGraphTransformer(llm=llm)
global_text_data = ""

# 1. NER Model (cybersecurity-domain token classifier)
MODEL_NAME = "CyberPeace-Institute/SecureBERT-NER"
NER_MODEL_INITIALIZED = False
ner_tokenizer = None
ner_pipeline = None

try:
    print("Attempting to load SecureBERT-NER Model...")
    ner_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    ner_model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
    ner_pipeline = pipeline(
        "token-classification",
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple"
    )
    print("NER Model loaded successfully.")
    NER_MODEL_INITIALIZED = True
except Exception as e:
    print("CRITICAL ERROR: Failed to load NER model. Knowledge Graph functionality will be disabled.")
    print(f"Details: {e}")

# 2. Sentence Embedding Model for Clustering
try:
    print("Attempting to load Sentence Transformer Model...")
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Sentence Transformer Model loaded successfully.")
except Exception as e:
    print("CRITICAL ERROR: Failed to load Sentence Transformer model. Clustering functionality will be disabled.")
    print(f"Details: {e}")

# 3. NLTK Tokenizer for Sentence Splitting
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    print("Downloading NLTK 'punkt' model...")
    nltk.download('punkt')

# 4. spaCy Model for Linguistic Analysis
try:
    print("Attempting to load spaCy Model...")
    nlp = spacy.load("en_core_web_sm")
    print("spaCy Model loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load spaCy model: {e}")

# 5. Sentiment Analysis Model
sentiment_pipeline = None
try:
    print("Attempting to load Sentiment Model...")
    sentiment_model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model_name)
    print("Sentiment Pipeline loaded successfully.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load Sentiment pipeline: {e}")

# --- CORE UTILITY FUNCTIONS ---
def extract_pdf_text(pdf_path):
    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    page_text = page_text.replace("\xa0", " ").strip()
                    text += page_text + "\n\n"
        if not text.strip():
            return "Error: No extractable text found in this PDF (it may be scanned or image-based)."
        return text
    except Exception as e:
        return f"Error reading PDF file with pdfplumber: {type(e).__name__}: {str(e)}"

def chunk_text(text, max_length=512, overlap=50):
    """Split text into overlapping windows of at most max_length tokens."""
    if not NER_MODEL_INITIALIZED:
        return ["Model not loaded."]
    tokens = ner_tokenizer.encode(text, add_special_tokens=False)
    chunks = [ner_tokenizer.decode(tokens[i:i + max_length]) for i in range(0, len(tokens), max_length - overlap)]
    return chunks

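# Illustration (not executed): with max_length=512 and overlap=50 the window
# stride is 462 tokens, so a 1,000-token text yields chunks covering tokens
# [0:512], [462:974], and [924:1000].
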
def clean_and_split_sentences(text):
    sentences = nltk.sent_tokenize(text)

    clean_sentences = []
    for sentence in sentences:
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        word_count = len(sentence.split())
        if word_count < 4 or word_count > 256:
            continue
        if not re.search(r'[a-zA-Z]{3,}', sentence):
            continue
        if sentence.lower().startswith(("figure ", "table ", "page ", "©", "appendix ")):
            continue
        clean_sentences.append(sentence)

    return clean_sentences

def remove_punc_fast(text):
    exclude = string.punctuation
    return text.translate(str.maketrans('', '', exclude))

def remove_stopwords(text):
    # Skip stopwords entirely; appending '' placeholders and joining would
    # leave double spaces behind.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(word for word in text.split() if word not in english_stopwords)

def clean_entity_names(entity_names):
    cleaned_words = []
    for word in entity_names:
        cleaned = re.sub(r'[^a-zA-Z\s]', '', word)
        cleaned = cleaned.strip()
        if cleaned:
            cleaned_words.append(cleaned)
    return cleaned_words

def preprocess_text(text):
    text = text.lower()
    text = remove_punc_fast(text)
    text = remove_stopwords(text)
    return text

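# Example (illustrative): preprocess_text("The attacker used phishing emails!")
# -> "attacker used phishing emails" (lower-cased, punctuation and stopwords removed).
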
async def extract_graph_data_async(text):
    documents = [Document(page_content=text)]
    graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)
    return graph_documents

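# LLMGraphTransformer prompts the chat model to extract entities and relations;
# each returned GraphDocument exposes .nodes and .relationships, which
# visualize_graph() below renders with pyvis.
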
def visualize_graph(graph_documents, output_file="llm_knowledge_graph.html"):
    net = Network(height="800px", width="100%", directed=True,
                  notebook=False, bgcolor="#222222", font_color="white",
                  filter_menu=True, cdn_resources='remote')

    if not graph_documents or not graph_documents[0].nodes:
        net.save_graph(output_file)
        return output_file

    nodes = graph_documents[0].nodes
    relationships = graph_documents[0].relationships
    node_dict = {node.id: node for node in nodes}

    # Keep only edges whose endpoints were both extracted as nodes,
    # and only nodes that participate in at least one edge.
    valid_edges = [rel for rel in relationships if rel.source.id in node_dict and rel.target.id in node_dict]
    valid_node_ids = set([rel.source.id for rel in valid_edges] + [rel.target.id for rel in valid_edges])

    for node_id in valid_node_ids:
        node = node_dict[node_id]
        net.add_node(node.id, label=node.id, title=node.type, group=node.type)

    for rel in valid_edges:
        net.add_edge(rel.source.id, rel.target.id, label=rel.type.lower())

    net.set_options("""
    {
      "physics": {
        "forceAtlas2Based": {
          "gravitationalConstant": -100,
          "centralGravity": 0.01,
          "springLength": 200,
          "springConstant": 0.08,
          "avoidOverlap": 0.5
        },
        "minVelocity": 0.75,
        "solver": "forceAtlas2Based"
      }
    }
    """)
    net.save_graph(output_file)
    return output_file

def generate_llm_kg(pdf_file):
    global global_text_data

    if pdf_file is None:
        return None, "Please upload a PDF file."
    text = extract_pdf_text(pdf_file.name)
    if text.startswith("Error"):
        return None, text

    try:
        graph_documents = asyncio.run(extract_graph_data_async(text))
        html_file_path = visualize_graph(graph_documents, output_file="llm_knowledge_graph.html")
        global_text_data = text  # Store extracted text for QnA
        return html_file_path, "LLM Knowledge Graph generated successfully! You can now ask questions in the QnA tab."
    except Exception as e:
        return None, f"Error generating LLM knowledge graph: {e}"

def answer_from_graph(query):
    global global_text_data
    if not global_text_data:
        return "Please generate a Knowledge Graph first by uploading a PDF."

    prompt = f"""
You are a helpful assistant. Use the following extracted content from a PDF to answer questions concisely.
Content:
{global_text_data}
Question: {query}
Answer:
"""
    try:
        response = llm.invoke(prompt)
        return response.content.strip()
    except Exception as e:
        return f"Error generating answer: {e}"

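# Note: QnA answers from the raw extracted text rather than the rendered graph,
# so a very long PDF may exceed the model's context window.
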
def batch_sentiment_analysis(sentences):
    """
    Runs the sentiment pipeline over all sentences in one batch and returns
    the five highest-scoring positive and negative examples.
    """
    if not sentences:
        return pd.DataFrame(columns=["Label", "Score", "Sentence"]), "No sentences to analyze."
    if sentiment_pipeline is None:
        return pd.DataFrame(), "Sentiment pipeline not loaded."

    try:
        results = sentiment_pipeline(sentences, truncation=True)
        df = pd.DataFrame(results)
        valid_sentences = sentences[:len(df)]
        df['Sentence'] = valid_sentences
        df['Score'] = df['score'].round(3)
        df['Label'] = df['label']
        positive_df = df[df['Label'] == 'POSITIVE'].nlargest(5, 'Score')
        negative_df = df[df['Label'] == 'NEGATIVE'].nlargest(5, 'Score')
        summary_df = pd.concat([positive_df, negative_df]).sort_values('Score', ascending=False)
        return summary_df[['Label', 'Score', 'Sentence']], f"Analyzed {len(sentences)} sentences."
    except Exception as e:
        return pd.DataFrame(), f"Error during sentiment analysis: {e}"

def batch_cti_classification(sentences):
    if not sentences:
        return pd.DataFrame(columns=["CTI Topic", "Mentions", "Example Sentence"]), "No sentences to analyze."

    keywords = {
        "Phishing": ["phishing", "vishing", "smishing"],
        "Malware": ["malware", "ransomware", "trojan", "keylogger", "emotet"],
        "Vulnerability": ["cve-", "vulnerability", "zero-day"],
        "Attack": ["attack", "breach", "incident", "apt-", "ddos"],
        "Exploit": ["exploit", "exploited", "rce", "remote code execution"],
    }
    topic_summary = {topic: {"count": 0, "example": ""} for topic in keywords}

    for sentence in sentences:
        sentence_lower = sentence.lower()
        found_in_sentence = set()  # count each topic at most once per sentence
        for topic, words in keywords.items():
            for word in words:
                if word in sentence_lower:
                    if topic not in found_in_sentence:
                        topic_summary[topic]["count"] += 1
                        if not topic_summary[topic]["example"]:
                            topic_summary[topic]["example"] = sentence
                        found_in_sentence.add(topic)

    summary_list = []
    for topic, data in topic_summary.items():
        if data["count"] > 0:
            summary_list.append({
                "CTI Topic": topic,
                "Mentions": data["count"],
                "Example Sentence": data["example"]
            })

    if not summary_list:
        return pd.DataFrame([{"CTI Topic": "No CTI Keywords Found", "Mentions": 0, "Example Sentence": ""}]), "No CTI keywords found in document."

    summary_df = pd.DataFrame(summary_list).sort_values("Mentions", ascending=False)
    return summary_df, f"Scanned {len(sentences)} sentences for CTI terms."

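# Example (illustrative): "The Emotet trojan arrived via phishing." counts one
# Malware mention (both "emotet" and "trojan" hit, deduplicated by the set)
# and one Phishing mention.
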
def get_cluster_topic_names(sentences, cluster_assignments):
    clustered_sentences = {i: [] for i in set(cluster_assignments)}
    for sentence, cluster_id in zip(sentences, cluster_assignments):
        clustered_sentences[cluster_id].append(sentence)
    topic_names = {}
    for cluster_id, docs in clustered_sentences.items():
        if cluster_id == -1:
            topic_names[cluster_id] = "Outliers / Miscellaneous"
            continue
        try:
            # Label each cluster with its top TF-IDF terms (uni- and bigrams).
            vectorizer = TfidfVectorizer(stop_words='english', max_features=3, ngram_range=(1, 2))
            corpus = [" ".join(docs)]
            vectorizer.fit(corpus)
            feature_names = vectorizer.get_feature_names_out()
            topic_names[cluster_id] = ", ".join(feature_names)
        except ValueError:
            topic_names[cluster_id] = "Short / Common Phrases"
    return topic_names

def perform_clustering(sentences):
    if not sentences:
        return None, None, None, "No sentences to cluster."
    embeddings = embedding_model.encode(sentences)
    dbscan = DBSCAN(eps=1.0, min_samples=2)
    dbscan.fit(embeddings)
    cluster_assignments = dbscan.labels_
    topic_names = get_cluster_topic_names(sentences, cluster_assignments)
    return embeddings, cluster_assignments, topic_names, f"Successfully clustered {len(sentences)} sentences."

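# DBSCAN labels points that fall in no dense region as -1 ("Outliers /
# Miscellaneous" above); eps=1.0 is a Euclidean-distance threshold over the
# 384-dimensional MiniLM embeddings and typically needs tuning per corpus.
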
def create_cluster_plot(embeddings, cluster_assignments, topic_names):
    if embeddings is None:
        return None
    pca = PCA(n_components=2)
    reduced_embeddings = pca.fit_transform(embeddings)
    fig, ax = plt.subplots(figsize=(12, 10))
    unique_labels = sorted(set(cluster_assignments))
    colors = [plt.cm.viridis(each) for each in np.linspace(0, 1, len(unique_labels))]
    for k, col in zip(unique_labels, colors):
        label = topic_names.get(k, "Unknown")
        if k == -1:
            col = [0, 0, 0, 1]  # draw noise points in black
        class_member_mask = (cluster_assignments == k)
        xy = reduced_embeddings[class_member_mask]
        ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                markeredgecolor='k', markersize=14 if k != -1 else 7, label=label)
    ax.set_title("Semantic Topic Clusters from PDF Document")
    ax.legend(title="Topics")
    return fig

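# The PCA projection is only for the 2-D scatter; cluster membership was
# decided in the full embedding space, so points that sit close on the plot
# can still carry different labels.
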
def show_cluster_sentences(selected_topic, topics_dict, assignments_list, sentences_list):
    if not selected_topic:
        return pd.DataFrame(columns=["Sentences"]), "Select a topic to see sample sentences."
    try:
        cluster_id = [key for key, value in topics_dict.items() if value == selected_topic][0]
        matching_sentences = []
        for sentence, assignment in zip(sentences_list, assignments_list):
            if assignment == cluster_id:
                matching_sentences.append(sentence)
        df = pd.DataFrame(matching_sentences, columns=["Sentences"])
        status = f"Showing {len(matching_sentences)} sentences for topic: '{selected_topic}'"
        return df, status
    except Exception as e:
        return pd.DataFrame(), f"Error finding sentences: {e}"

def run_bertopic_modeling(sentences):
    if not sentences:
        return None, None, "No sentences to model. Please process a report first."

    try:
        print("Starting BERTopic modeling...")
        topic_model = BERTopic(verbose=False, min_topic_size=6, embedding_model=embedding_model)
        topics, probs = topic_model.fit_transform(sentences)

        # Topic overview for the table
        topic_info = topic_model.get_topic_info()

        # Bar chart of the top topics
        fig = topic_model.visualize_barchart(top_n_topics=10)

        print("BERTopic modeling complete.")
        return fig, topic_info, "BERTopic analysis complete."

    except Exception as e:
        return None, None, f"Error during BERTopic analysis: {e}"

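# visualize_barchart() returns a Plotly figure, which gr.Plot renders natively.
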
def linguistic_analysis_spacy(text):
    if not text or not text.strip():
        return [], "<p>Please enter text for analysis.</p>"

    doc = nlp(text)  # uses the global spaCy model
    pos_tags = [(t.text, t.pos_, t.dep_) for t in doc]

    # Render the dependency parse as SVG (white text on a green background).
    options = {'distance': 110, 'compact': True, 'color': '#fff', 'bg': '#00a65a', 'font': 'sans-serif'}
    svg = displacy.render(doc, style="dep", jupyter=False, options=options)

    html_wrapper = f"""
    <div style="background-color: white; border: 1px solid #E5E7EB; border-radius: 8px; padding: 12px; overflow-x: auto;">
        {svg}
    </div>
    """

    return pos_tags, html_wrapper

# --- GRADIO WORKFLOW FUNCTIONS ---

def unified_process_report(file_obj):
    # Always return four values to match the click handler's outputs:
    # (status_output, sentences_state, llm_graph_output_file, llm_status).
    if file_obj is None:
        return "Please upload a PDF file.", [], None, ""

    if not NER_MODEL_INITIALIZED:
        return "CRITICAL: NER Model failed to load.", [], None, ""

    text = extract_pdf_text(file_obj.name)
    if text.startswith("Error"):
        return text, [], None, ""

    sentences = clean_and_split_sentences(text)

    preprocessed_sentences_for_state = [preprocess_text(s) for s in sentences]

    status = f"Processed {len(sentences)} clean sentences successfully."

    try:
        html_file_path, kg_status = generate_llm_kg(file_obj)
        combined_status = f"{status}\n{kg_status}"
    except Exception as e:
        html_file_path, combined_status = None, f"{status}\nError generating Knowledge Graph: {e}"

    return status, preprocessed_sentences_for_state, html_file_path, combined_status

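# The sentence state stores the preprocessed (lower-cased, stopword-free)
# sentences; the clustering, summary, and BERTopic tabs below all read from it.
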
def run_clustering_workflow(sentences):
    embeddings, labels, topics, status = perform_clustering(sentences)
    if topics is None:
        # Nothing to cluster yet (no report has been processed).
        return None, status, [], {}, gr.Dropdown(choices=[]), pd.DataFrame(columns=["Sentences"])
    plot = create_cluster_plot(embeddings, labels, topics)
    topic_name_list = list(topics.values())
    sentence_df = pd.DataFrame(sentences, columns=["Sentences"])
    return plot, status, labels, topics, gr.Dropdown(choices=topic_name_list), sentence_df

def run_batch_analysis(sentences):
    cti_df, cti_status = batch_cti_classification(sentences)
    sent_df, sent_status = batch_sentiment_analysis(sentences)
    full_status = f"CTI: {cti_status} | Sentiment: {sent_status}"
    return cti_df, sent_df, full_status

def on_click(pdf_file):
    # Standalone wrapper around generate_llm_kg (not wired to a button below).
    html_file_path, status = generate_llm_kg(pdf_file)
    return html_file_path, status

# --- GRADIO INTERFACE LAYOUT ---

with gr.Blocks(title="CTI Analysis Tool", theme=gr.themes.Soft()) as app:
    gr.Markdown("# Cyber Threat Intelligence (CTI) Analysis Tool")
    gr.Markdown("Upload a CTI report (PDF) to analyze entities and semantic topics.")

    # --- State Variables ---
    sentences_state = gr.State([])
    cluster_assignments_state = gr.State([])
    cluster_topics_state = gr.State({})

    # --- Main Upload Row ---
    with gr.Row():
        file_input = gr.File(label="Upload CTI Report (PDF)", file_types=[".pdf"])
        process_button = gr.Button("Process Report", variant="primary")
        status_output = gr.Textbox(label="Processing Status", interactive=False)

    # --- Tabs ---
    with gr.Tabs():
        with gr.TabItem("Knowledge Graph Analyzer"):
            gr.Markdown("### Knowledge Graph")
            llm_status = gr.Textbox(label="Status", interactive=False)
            llm_graph_output_file = gr.File(label="Knowledge Graph HTML File", file_types=[".html"], interactive=False)

        with gr.TabItem("Knowledge Graph QnA"):
            gr.Markdown("### Ask Questions About the Knowledge Graph")
            user_query = gr.Textbox(label="Enter your question", placeholder="e.g., Which malware communicates with example.com?")
            ask_button = gr.Button("Get Answer")
            answer_box = gr.Textbox(label="Answer", lines=5, interactive=False)

        with gr.TabItem("Semantic Topic Clustering"):
            gr.Markdown("### Group Sentences by Semantic Meaning (DBSCAN)")
            cluster_button = gr.Button("1. Cluster PDF Sentences", variant="secondary")
            cluster_status = gr.Textbox(label="Clustering Status", interactive=False)
            gr.Markdown("#### Sentences Used for Clustering")
            input_sentence_df = gr.DataFrame(headers=["Sentences"], label="Input Sentences", interactive=False, row_count=10)
            gr.Markdown("#### Cluster Visualization")
            cluster_plot_output = gr.Plot(label="Sentence Cluster Visualization")
            gr.Markdown("### Explore Clusters")
            with gr.Row():
                topic_dropdown = gr.Dropdown(label="Select Topic", choices=[], interactive=True, scale=3)
                cluster_sentence_df = gr.DataFrame(headers=["Sentences"], label="Sentences in Selected Cluster", interactive=False, scale=4, row_count=10)

        with gr.TabItem("Document Summary"):
            gr.Markdown("### Sentiment & CTI Summary")
            analyze_pdf_button = gr.Button("Analyze PDF Sentences", variant="primary")
            summary_status = gr.Textbox(label="Analysis Status", interactive=False)
            gr.Markdown("#### CTI Keyword Summary")
            cti_summary_output = gr.DataFrame(headers=["CTI Topic", "Mentions", "Example Sentence"], label="CTI Summary")
            gr.Markdown("#### Sentiment Analysis")
            sentiment_summary_output = gr.DataFrame(headers=["Label", "Score", "Sentence"], label="Sentiment Highlights", row_count=10)

        # --- NEW: BERTopic Tab ---
        with gr.TabItem("Topic Modeling (BERTopic)"):
            gr.Markdown("### Advanced Topic Modeling with BERTopic")
            gr.Markdown("Run BERTopic on the full list of cleaned sentences to discover themes.")
            bertopic_button = gr.Button("Run Topic Model", variant="secondary")
            bertopic_status = gr.Textbox(label="BERTopic Status", interactive=False)
            gr.Markdown("#### Top 10 Discovered Topics")
            bertopic_plot = gr.Plot(label="BERTopic Barchart")
            gr.Markdown("#### All Discovered Topics")
            bertopic_df = gr.DataFrame(label="BERTopic Topic List")

        # --- NEW: Linguistic Analysis Tab ---
        with gr.TabItem("Linguistic Analysis (spaCy)"):
            gr.Markdown("### POS Tagging & Dependency Parsing")
            gr.Markdown("Analyze the grammatical structure of a single sentence.")
            ling_input = gr.Textbox(label="Enter a sentence to analyze", lines=3, placeholder="e.g., Copy a sentence from the cluster results...")
            ling_button = gr.Button("Analyze Syntax")
            gr.Markdown("#### Part-of-Speech (POS) Tags")
            ling_pos_df = gr.DataFrame(headers=["Token", "POS", "Dependency"], label="POS Tags", row_count=10)
            gr.Markdown("#### Dependency Plot")
            ling_dep_html = gr.HTML(label="Dependency Visualization")

    # --- EVENT HANDLERS ---
    process_button.click(
        fn=unified_process_report,
        inputs=[file_input],
        outputs=[status_output, sentences_state, llm_graph_output_file, llm_status]
    )

    cluster_button.click(
        fn=run_clustering_workflow,
        inputs=[sentences_state],
        outputs=[
            cluster_plot_output,
            cluster_status,
            cluster_assignments_state,
            cluster_topics_state,
            topic_dropdown,
            input_sentence_df
        ]
    )

    topic_dropdown.select(
        fn=show_cluster_sentences,
        inputs=[
            topic_dropdown,
            cluster_topics_state,
            cluster_assignments_state,
            sentences_state
        ],
        outputs=[cluster_sentence_df, cluster_status]
    )

    analyze_pdf_button.click(
        fn=run_batch_analysis,
        inputs=sentences_state,
        outputs=[cti_summary_output, sentiment_summary_output, summary_status]
    )

    bertopic_button.click(
        fn=run_bertopic_modeling,
        inputs=[sentences_state],
        outputs=[bertopic_plot, bertopic_df, bertopic_status]
    )

    ling_button.click(
        fn=linguistic_analysis_spacy,
        inputs=[ling_input],
        outputs=[ling_pos_df, ling_dep_html]
    )

    ask_button.click(
        fn=answer_from_graph,
        inputs=[user_query],
        outputs=[answer_box]
    )

app.launch(debug=True)
requirements.txt
ADDED
@@ -0,0 +1,17 @@
gradio
transformers
sentence-transformers
scikit-learn
pandas
numpy
matplotlib
nltk
spacy
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
pdfplumber
pyvis
langchain
langchain-openai
langchain-experimental
bertopic
protobuf
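# Note: versions are left unpinned; the en-core-web-sm wheel above targets
# spaCy 3.7.x, so pinning spacy>=3.7,<3.8 would keep the model and library in sync.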