File size: 3,639 Bytes
bb3e7d0
 
 
b160c3d
 
9a034e8
bb3e7d0
 
b160c3d
5ce4f5a
b160c3d
5ce4f5a
 
 
 
 
 
b160c3d
5ce4f5a
 
 
 
 
b160c3d
5ce4f5a
 
 
 
 
 
b160c3d
 
5ce4f5a
b160c3d
 
9a034e8
bb3e7d0
 
b160c3d
5ce4f5a
9a034e8
bb3e7d0
5ce4f5a
9a034e8
bb3e7d0
 
5ce4f5a
9a034e8
5ce4f5a
 
b160c3d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a034e8
bb3e7d0
5ce4f5a
9a034e8
bb3e7d0
5ce4f5a
b160c3d
bb3e7d0
9a034e8
bb3e7d0
 
 
9a034e8
 
bb3e7d0
5ce4f5a
 
 
9a034e8
5ce4f5a
9a034e8
b160c3d
9a034e8
bb3e7d0
5ce4f5a
9a034e8
bb3e7d0
 
b160c3d
bb3e7d0
 
9a034e8
5ce4f5a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import gradio as gr
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import os
import pandas as pd

_EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Process-wide cache so the embedding model is loaded once instead of on
# every button click.
_embedder_cache = {}


def _get_embedder():
    """Return the shared SentenceTransformer, constructing it on first use."""
    if _EMBEDDING_MODEL_NAME not in _embedder_cache:
        _embedder_cache[_EMBEDDING_MODEL_NAME] = SentenceTransformer(
            _EMBEDDING_MODEL_NAME
        )
    return _embedder_cache[_EMBEDDING_MODEL_NAME]


def _read_uploaded_text(file):
    """Extract text from an upload; return ``(text, error_message_or_None)``.

    Three input forms are recognised, checked in this order:

    * an object with ``decode`` (e.g. ``bytes``) -- decoded as UTF-8;
    * an object with ``read`` -- its content is used directly when ``read``
      returns ``str`` and decoded as UTF-8 otherwise;
    * a ``str`` naming an existing path -- opened and read as UTF-8.

    Any other input yields an empty text and no error, which the caller
    reports as unreadable content.
    """
    text = ""

    # Broad ``except Exception`` is deliberate: this runs at the UI boundary
    # and every failure is turned into a message for the user.
    if hasattr(file, "decode"):
        try:
            text = file.decode("utf-8")
        except Exception as e:
            return "", f"Error decoding NamedString: {e}"

    elif hasattr(file, "read"):
        try:
            raw = file.read()
            # Text-mode handles already return str; only bytes need decoding.
            text = raw if isinstance(raw, str) else raw.decode("utf-8")
        except Exception as e:
            return "", f"Error reading/decoding file object: {e}"

    elif isinstance(file, str) and os.path.exists(file):
        try:
            with open(file, "r", encoding="utf-8") as f:
                text = f.read()
        except Exception as e:
            return "", f"Error reading file from path: {e}"

    # Drop a leading byte-order mark: U+FEFF is not whitespace, so
    # ``str.strip`` would otherwise leave it glued to the first document.
    if text.startswith("\ufeff"):
        text = text[1:]

    return text, None


def _format_topic_weights(topic_model):
    """Render the top 10 (word, weight) pairs of every non-outlier topic."""
    rule = "=" * 80
    parts = [
        rule + "\n",
        "TOPIC WEIGHTS (Word Importance Scores)\n",
        rule + "\n\n",
    ]

    for topic_id, topic_words in topic_model.get_topics().items():
        if topic_id == -1:  # -1 is the outlier topic
            continue

        parts.append(f"TOPIC {topic_id}\n")
        parts.append("-" * 40 + "\n")
        if topic_words:
            parts.extend(
                f"  {word:20s} {weight:8.4f}\n"
                for word, weight in topic_words[:10]
            )
        parts.append("\n")

    return "".join(parts)


def run_from_textfile(file):
    """Run BERTopic over an uploaded text file with one document per line.

    Args:
        file: The upload. May be ``None``, a bytes-like object, a file-like
            object with ``read()``, or a ``str`` path to an existing file.

    Returns:
        A 4-tuple ``(topic_info, weights, assignments, figure)``:
        the topic overview table as text, the per-topic word weights as
        text, one ``"Doc N: Topic K"`` line per document, and the bar-chart
        figure from ``visualize_barchart``. When the input cannot be used,
        the tuple is ``(error_message, "", "", None)`` instead.
    """
    if file is None:
        return "Please upload a .txt file.", "", "", None

    # ---- Handle file input ----
    text, error = _read_uploaded_text(file)
    if error is not None:
        return error, "", "", None
    if not text:
        return "Could not read the file content. Please check the file type and content.", "", "", None

    # Split the text into documents (one per line), skipping blank lines.
    stripped_lines = (line.strip() for line in text.split("\n"))
    docs = [line for line in stripped_lines if line]
    if len(docs) < 3:
        return "Need at least 3 documents (one per line).", "", "", None

    # ---- Topic Modeling ----
    topic_model = BERTopic(embedding_model=_get_embedder())
    topics, _ = topic_model.fit_transform(docs)

    # ---- Topic Summary ----
    topic_info = topic_model.get_topic_info().to_string(index=False)

    # ---- TOPIC WEIGHTS (Word Importance per Topic) ----
    weights_output = _format_topic_weights(topic_model)

    # ---- Document → Topic Assignments ----
    assignments = "\n".join(
        f"Doc {doc_number}: Topic {topic}"
        for doc_number, topic in enumerate(topics, start=1)
    )

    # ---- Visualization ----
    fig = topic_model.visualize_barchart(top_n_topics=10)

    return topic_info, weights_output, assignments, fig

# ---- Gradio Interface ----
# Page layout: heading and instructions, an upload widget, a trigger button,
# then three text panes and one plot that receive the handler's results.
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Topic Modeling from TXT File (BERTopic)")
    gr.Markdown(
        "Upload a plain text (.txt) file. Each line should contain **one LLM response**.\n"
        "\nExample format:\n```\nResponse 1...\nResponse 2...\nResponse 3...\n```"
    )

    # Input side: the uploaded file and the button that starts the run.
    file_input = gr.File(label="Upload .txt file")
    run_button = gr.Button("Run Topic Modeling")

    # Output side, declared in the same order as run_from_textfile's
    # return tuple: overview, word weights, assignments, figure.
    topic_output = gr.Textbox(lines=12, label="Topic Overview")
    weights_output = gr.Textbox(
        lines=20,
        label="📊 Topic Weights (Word Importance)",
    )
    assignment_output = gr.Textbox(
        lines=12,
        label="Document → Topic Assignments",
    )
    fig_output = gr.Plot(label="Topic Visualization")

    run_button.click(
        fn=run_from_textfile,
        inputs=[file_input],
        outputs=[
            topic_output,
            weights_output,
            assignment_output,
            fig_output,
        ],
    )

# Launch app
demo.launch()