Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,55 +1,114 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.express as px
|
|
|
|
|
|
|
|
|
|
| 4 |
import os
|
| 5 |
|
| 6 |
def process_file(file):
|
| 7 |
-
# Determine file extension
|
| 8 |
file_ext = os.path.splitext(file.name)[1].lower()
|
| 9 |
-
|
| 10 |
-
# Load file accordingly
|
| 11 |
if file_ext == '.csv':
|
| 12 |
df = pd.read_csv(file.name)
|
| 13 |
elif file_ext in ['.xls', '.xlsx']:
|
| 14 |
df = pd.read_excel(file.name)
|
| 15 |
else:
|
| 16 |
-
return "Unsupported file format", None, None
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
| 21 |
|
| 22 |
-
# Convert Unix timestamp to datetime
|
| 23 |
df['date'] = pd.to_datetime(df['timestamp'], unit='s')
|
| 24 |
-
|
| 25 |
-
# --- Plot 1: Messages per Day ---
|
| 26 |
df['date_only'] = df['date'].dt.date
|
|
|
|
|
|
|
| 27 |
messages_per_day = df.groupby("date_only").size().reset_index(name="count")
|
| 28 |
fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")
|
| 29 |
|
| 30 |
-
#
|
| 31 |
top_authors = df['author'].value_counts().nlargest(20).reset_index()
|
| 32 |
top_authors.columns = ['author', 'count']
|
| 33 |
fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
|
| 34 |
fig2.update_layout(xaxis_tickangle=-45)
|
| 35 |
|
| 36 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
os.remove(file.name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# Gradio interface
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
if __name__ == "__main__":
|
| 55 |
-
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
import plotly.express as px
|
| 4 |
+
import networkx as nx
|
| 5 |
+
from pyvis.network import Network
|
| 6 |
+
import tempfile
|
| 7 |
import os
|
| 8 |
|
| 9 |
def process_file(file):
    """Load an uploaded comment export and build every dashboard output.

    Args:
        file: The upload handed over by Gradio - either an object exposing the
            temporary path as ``.name`` or the path itself as a string.
            ``None`` (sent when the upload widget is cleared) is accepted.

    Returns:
        A 5-tuple ``(status, fig1, fig2, fig3, network_html)``. When the input
        is rejected, ``status`` describes the problem and the remaining four
        entries are ``None``.

    Side effects:
        On success, writes the processed table to ``latest_data.csv`` (the
        file the keyword search reads) and deletes the uploaded file.
    """
    no_outputs = (None, None, None, None)

    # Clearing the upload widget triggers the change event with no file.
    if file is None:
        return ("No file uploaded",) + no_outputs

    # Accept both file-like wrappers (``.name``) and plain path strings.
    path = getattr(file, "name", file)
    file_ext = os.path.splitext(path)[1].lower()

    if file_ext == '.csv':
        df = pd.read_csv(path)
    elif file_ext in ('.xls', '.xlsx'):
        df = pd.read_excel(path)
    else:
        return ("Unsupported file format",) + no_outputs

    required_cols = ['timestamp', 'author', 'text', 'id', 'parent']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        return (f"Missing columns: {', '.join(missing)}",) + no_outputs

    # Timestamps are interpreted as Unix seconds; report bad data as a status
    # message instead of letting the exception escape into the UI.
    try:
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    except (ValueError, TypeError, OverflowError) as exc:
        return (f"Invalid timestamp column: {exc}",) + no_outputs
    df['date_only'] = df['date'].dt.date

    # Messages per Day
    messages_per_day = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

    # Top Authors
    top_authors = df['author'].value_counts().nlargest(20).reset_index()
    top_authors.columns = ['author', 'count']
    fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # Timeline of Comments
    # 'like_count' is optional (it is not in required_cols), so only request
    # hover columns that are actually present in the upload.
    hover_cols = [col for col in ('author', 'text', 'like_count') if col in df.columns]
    fig3 = px.scatter(df, x='date', y=[0] * len(df),
                      hover_data=hover_cols,
                      title="Comments Over Time", labels={'y': ''})
    fig3.update_traces(marker=dict(size=6, opacity=0.5))
    fig3.update_layout(yaxis=dict(showticklabels=False))

    # Save to CSV for keyword search
    df.to_csv("latest_data.csv", index=False)

    os.remove(path)
    return "Success", fig1, fig2, fig3, build_network_html(df)
|
| 48 |
+
|
| 49 |
+
def build_network_html(df):
    """Render the comment thread structure as a pyvis HTML document.

    Builds a directed graph with one node per comment (labelled with its
    author), an edge from each author to their comment, and an edge from a
    parent comment to each reply.

    Args:
        df: DataFrame with ``author``, ``id`` and ``parent`` columns. Rows
            whose ``parent`` is missing (NaN/None) get no parent edge.

    Returns:
        The generated HTML page as a string.
    """

    def _node_key(value):
        # A column holding integer ids plus missing values is loaded as
        # float, so a parent id arrives as 123.0 while the comment id is 123.
        # Normalise both so the parent edge reaches the existing node.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    G = nx.DiGraph()

    for author, raw_id, parent_id in zip(df['author'], df['id'], df['parent']):
        author = str(author)
        comment_id = _node_key(raw_id)

        # Add node for comment
        G.add_node(comment_id, label=author)

        # Add edge from author to comment
        G.add_edge(author, comment_id)

        # Add edge from parent to this comment (thread link)
        if pd.notna(parent_id):
            G.add_edge(_node_key(parent_id), comment_id)

    net = Network(height="400px", width="100%", notebook=False, directed=True)
    net.from_nx(G)

    # The scratch directory is removed as soon as the HTML has been read back,
    # so repeated uploads do not accumulate temp folders. save_graph is used
    # because only the file is needed here, not an interactive display.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "net.html")
        net.save_graph(html_path)

        with open(html_path, "r", encoding="utf-8") as f:
            html_content = f.read()

    return html_content
|
| 78 |
+
|
| 79 |
+
def search_keyword(keyword):
    """Return up to 100 stored comments whose text contains ``keyword``.

    The search runs against ``latest_data.csv`` in the working directory. The
    match is a case-insensitive literal substring test, so characters such as
    ``(`` or ``*`` in the keyword are matched as-is rather than as a regex.

    Args:
        keyword: Text to look for. ``None`` is treated as an empty string,
            which matches every comment.

    Returns:
        A DataFrame with columns ``date``, ``author``, ``like_count`` and
        ``text``. It is empty when no data file exists or the file lacks the
        ``text`` or ``timestamp`` column; ``like_count`` is filled with NaN
        when the stored data has no such column.
    """
    output_cols = ['date', 'author', 'like_count', 'text']

    if not os.path.exists("latest_data.csv"):
        return pd.DataFrame(columns=output_cols)

    df = pd.read_csv("latest_data.csv")

    if 'text' not in df.columns or 'timestamp' not in df.columns:
        return pd.DataFrame(columns=output_cols)

    needle = "" if keyword is None else str(keyword)
    # fillna first so missing texts do not turn into the literal "nan".
    haystack = df['text'].fillna("").astype(str)
    mask = haystack.str.contains(needle, case=False, na=False, regex=False)

    # reindex yields an independent frame (safe to assign into) and supplies
    # a NaN column for any optional field, such as like_count, that is absent.
    result = df.loc[mask].head(100).reindex(
        columns=['timestamp', 'author', 'text', 'like_count']
    )
    result['date'] = pd.to_datetime(result['timestamp'], unit='s')
    return result[output_cols]
|
| 92 |
|
| 93 |
# Gradio interface
# Layout: upload widget, status line, three plots, the thread network, then a
# keyword search row with its results table.
# NOTE(review): the label strings were damaged by an encoding round-trip.
# Emoji whose bytes survived are restored; the status check mark is inferred
# from context (confirm); where the bytes were lost the stray residue is
# dropped - re-add emoji there if desired.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Comment Thread Analyzer with Timeline + Search")
    file_input = gr.File(label="Upload CSV or XLSX", file_types=[".csv", ".xls", ".xlsx"])

    status = gr.Textbox(label="✅ Status")
    plot1 = gr.Plot(label="Messages per Day")
    plot2 = gr.Plot(label="👤 Top 20 Authors")
    timeline = gr.Plot(label="Comment Timeline")
    network_html = gr.HTML(label="🧵 Thread Network")

    with gr.Row():
        keyword_input = gr.Textbox(label="Search Keyword in Comments")
        search_button = gr.Button("Search")

    search_results = gr.Dataframe(headers=["date", "author", "like_count", "text"], label="Search Results")

    # Re-run the full analysis whenever the uploaded file changes.
    file_input.change(fn=process_file, inputs=file_input, outputs=[status, plot1, plot2, timeline, network_html])
    # Keyword search runs against the data saved by the last successful upload.
    search_button.click(fn=search_keyword, inputs=keyword_input, outputs=search_results)

if __name__ == "__main__":
    demo.launch()
|