Spaces:
Sleeping
Sleeping
File size: 3,639 Bytes
import gradio as gr
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import os
import pandas as pd
def run_from_textfile(file):
    """Run BERTopic topic modeling over an uploaded .txt file.

    Each non-blank line of the file is treated as one document.

    Parameters
    ----------
    file : Gradio file input — may be a NamedString (has ``.decode``), a
        file-like object (has ``.read``), or a filesystem path string.
        ``None`` when nothing was uploaded.

    Returns
    -------
    tuple
        ``(topic_overview, topic_weights, assignments, figure)`` — three
        display strings plus a Plotly barchart figure. On any error the
        first element carries the message and the figure is ``None``.
    """
    if file is None:
        return "Please upload a .txt file.", "", "", None

    # ---- Handle the several shapes Gradio may hand us ----
    text = ""
    if hasattr(file, 'decode'):  # Gradio NamedString
        try:
            text = file.decode("utf-8")
        except Exception as e:
            return f"Error decoding NamedString: {e}", "", "", None
    elif hasattr(file, 'read'):  # open file-like object
        try:
            text = file.read().decode("utf-8")
        except Exception as e:
            return f"Error reading/decoding file object: {e}", "", "", None
    elif isinstance(file, str) and os.path.exists(file):  # plain path
        try:
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
        except Exception as e:
            return f"Error reading file from path: {e}", "", "", None

    if not text:
        return "Could not read the file content. Please check the file type and content.", "", "", None

    # One document per line; splitlines() also copes with \r\n endings.
    docs = [line.strip() for line in text.splitlines() if line.strip()]
    if len(docs) < 3:
        return "Need at least 3 documents (one per line).", "", "", None

    # ---- Embedding model ----
    # Cache on the function object so repeated clicks don't re-load
    # (or re-download) the SentenceTransformer weights every time.
    if not hasattr(run_from_textfile, "_embedder"):
        run_from_textfile._embedder = SentenceTransformer("all-MiniLM-L6-v2")

    # ---- Topic modeling ----
    topic_model = BERTopic(embedding_model=run_from_textfile._embedder)
    topics, _ = topic_model.fit_transform(docs)  # probabilities unused

    # ---- Topic summary ----
    topic_info = topic_model.get_topic_info().to_string(index=False)

    # ---- Topic weights (word importance per topic) ----
    weights_output = "=" * 80 + "\n"
    weights_output += "TOPIC WEIGHTS (Word Importance Scores)\n"
    weights_output += "=" * 80 + "\n\n"
    # Iterate all topics except the outlier topic (-1).
    for topic_id in (t for t in topic_model.get_topics() if t != -1):
        weights_output += f"TOPIC {topic_id}\n"
        weights_output += "-" * 40 + "\n"
        # Top words and their c-TF-IDF weights for this topic.
        topic_words = topic_model.get_topic(topic_id)
        if topic_words:
            for word, weight in topic_words[:10]:  # top 10 words
                weights_output += f" {word:20s} {weight:8.4f}\n"
        weights_output += "\n"

    # ---- Document → Topic Assignments ----
    assignments = "\n".join(
        f"Doc {i}: Topic {t}" for i, t in enumerate(topics, start=1)
    )

    # ---- Visualization ----
    fig = topic_model.visualize_barchart(top_n_topics=10)
    return topic_info, weights_output, assignments, fig
# ---- Gradio Interface ----
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Topic Modeling from TXT File (BERTopic)")
    gr.Markdown(
        "Upload a plain text (.txt) file. Each line should contain **one LLM response**.\n"
        "\nExample format:\n```\nResponse 1...\nResponse 2...\nResponse 3...\n```"
    )
    file_input = gr.File(label="Upload .txt file")
    run_button = gr.Button("Run Topic Modeling")
    topic_output = gr.Textbox(label="Topic Overview", lines=12)
    weights_output = gr.Textbox(label="📊 Topic Weights (Word Importance)", lines=20)
    assignment_output = gr.Textbox(label="Document → Topic Assignments", lines=12)
    fig_output = gr.Plot(label="Topic Visualization")

    # Wire the button to the pipeline: one file in, four output panels.
    run_button.click(
        fn=run_from_textfile,
        inputs=file_input,
        outputs=[topic_output, weights_output, assignment_output, fig_output]
    )

# Launch app
demo.launch()