clementBE committed on
Commit
d308445
·
verified ·
1 Parent(s): 70a4e98

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -25
app.py CHANGED
@@ -1,55 +1,114 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
 
 
 
4
  import os
5
 
6
def process_file(file):
    """Load an uploaded message export and build the two summary plots.

    Args:
        file: The upload to analyse. Either an object exposing the file path
            as ``.name`` or a plain path string. ``None`` is tolerated (it is
            assumed to mean "nothing uploaded" -- confirm against the UI
            framework's behaviour).

    Returns:
        A 3-tuple ``(status, fig1, fig2)``. On success ``status`` is
        ``"Success"``, ``fig1`` is the messages-per-day line chart and
        ``fig2`` the top-20-authors bar chart. On any failure ``status``
        describes the problem and both figures are ``None``.

    Side effects:
        Deletes the uploaded file from disk after a successful run.
    """
    if file is None:
        return "No file uploaded", None, None

    # Accept both a path string and a file wrapper carrying the path in .name.
    path = file if isinstance(file, str) else file.name

    # Pick the reader from the file extension.
    file_ext = os.path.splitext(path)[1].lower()
    if file_ext == '.csv':
        df = pd.read_csv(path)
    elif file_ext in ['.xls', '.xlsx']:
        df = pd.read_excel(path)
    else:
        return "Unsupported file format", None, None

    # Both plots depend on these two columns.
    if 'timestamp' not in df.columns or 'author' not in df.columns:
        return "The file must contain 'timestamp' and 'author' columns.", None, None

    # Convert Unix timestamps (seconds) to datetimes; report bad data instead
    # of letting the exception escape to the UI.
    try:
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    except (ValueError, TypeError) as exc:
        return f"Could not parse 'timestamp' as Unix seconds: {exc}", None, None

    # --- Plot 1: Messages per Day ---
    df['date_only'] = df['date'].dt.date
    messages_per_day = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

    # --- Plot 2: Messages by Author ---
    top_authors = df['author'].value_counts().nlargest(20).reset_index()
    top_authors.columns = ['author', 'count']
    fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # The upload is no longer needed once the figures are built.
    os.remove(path)

    return "Success", fig1, fig2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
# Gradio interface: one file upload in, a status line and two plots out.
_upload_input = gr.File(
    label="Upload CSV or XLSX",
    file_types=[".csv", ".xls", ".xlsx"],
)
_result_outputs = [
    gr.Textbox(label="Status"),
    gr.Plot(label="Messages per Day"),
    gr.Plot(label="Top Authors"),
]

interface = gr.Interface(
    fn=process_file,
    inputs=_upload_input,
    outputs=_result_outputs,
    title="Message Analyzer",
    description="Upload a CSV or XLSX file with 'timestamp' (Unix) and 'author' columns.",
)

# Only start the web server when executed as a script.
if __name__ == "__main__":
    interface.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  import plotly.express as px
4
+ import networkx as nx
5
+ from pyvis.network import Network
6
+ import tempfile
7
  import os
8
 
9
def process_file(file):
    """Load an uploaded comment export and build every dashboard output.

    Args:
        file: The upload to analyse. Either an object exposing the file path
            as ``.name`` or a plain path string. ``None`` is tolerated (it is
            assumed to mean "nothing uploaded" -- confirm against the UI
            framework's behaviour).

    Returns:
        A 5-tuple ``(status, fig1, fig2, fig3, network_html)``. On success
        ``status`` is ``"Success"``; the figures are the messages-per-day
        line chart, the top-20-authors bar chart and the comment timeline,
        and ``network_html`` is whatever ``build_network_html`` returns.
        On any failure ``status`` describes the problem and the remaining
        four items are ``None``.

    Side effects:
        Writes the parsed table to ``latest_data.csv`` in the working
        directory (read back by ``search_keyword``) and deletes the uploaded
        file after a successful run.
    """
    no_outputs = (None, None, None, None)

    if file is None:
        return ("No file uploaded", *no_outputs)

    # Accept both a path string and a file wrapper carrying the path in .name.
    path = file if isinstance(file, str) else file.name

    file_ext = os.path.splitext(path)[1].lower()
    if file_ext == '.csv':
        df = pd.read_csv(path)
    elif file_ext in ['.xls', '.xlsx']:
        df = pd.read_excel(path)
    else:
        return ("Unsupported file format", *no_outputs)

    required_cols = ['timestamp', 'author', 'text', 'id', 'parent']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        return (f"Missing columns: {', '.join(missing)}", *no_outputs)

    # Convert Unix timestamps (seconds) to datetimes; report bad data instead
    # of letting the exception escape to the UI.
    try:
        df['date'] = pd.to_datetime(df['timestamp'], unit='s')
    except (ValueError, TypeError) as exc:
        return (f"Could not parse 'timestamp' as Unix seconds: {exc}", *no_outputs)
    df['date_only'] = df['date'].dt.date

    # Messages per Day
    messages_per_day = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

    # Top Authors
    top_authors = df['author'].value_counts().nlargest(20).reset_index()
    top_authors.columns = ['author', 'count']
    fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # Timeline of Comments: every comment sits on one horizontal line (y=0).
    # 'like_count' is optional, so only request hover columns that exist.
    hover_cols = [col for col in ('author', 'text', 'like_count') if col in df.columns]
    fig3 = px.scatter(df, x='date', y=[0] * len(df),
                      hover_data=hover_cols,
                      title="Comments Over Time", labels={'y': ''})
    fig3.update_traces(marker=dict(size=6, opacity=0.5))
    fig3.update_layout(yaxis=dict(showticklabels=False))

    # Save to CSV for keyword search
    df.to_csv("latest_data.csv", index=False)

    # The upload is no longer needed once the table has been persisted.
    os.remove(path)
    return "Success", fig1, fig2, fig3, build_network_html(df)
48
+
49
def build_network_html(df):
    """Render the author/comment/thread graph of ``df`` as an HTML document.

    For every row a node is added for the comment (labelled with its author),
    an edge ``author -> comment`` and, when ``parent`` is not missing, an
    edge ``parent -> comment``.

    Args:
        df: DataFrame with ``author``, ``id`` and ``parent`` columns.

    Returns:
        The HTML text produced by pyvis for the graph.
    """
    G = nx.DiGraph()

    # Iterate the three needed columns in lockstep instead of building a
    # Series per row with iterrows().
    for author, comment_id, parent_id in zip(df['author'], df['id'], df['parent']):
        author = str(author)
        comment_id = str(comment_id)

        # Add node for comment
        G.add_node(comment_id, label=author)

        # Add edge from author to comment
        G.add_edge(author, comment_id)

        # Add edge from parent to this comment (thread link)
        if pd.notna(parent_id):
            G.add_edge(str(parent_id), comment_id)

    net = Network(height="400px", width="100%", notebook=False, directed=True)
    net.from_nx(G)

    # write_html only writes the file; show() may additionally try to open a
    # browser or render with the notebook template, neither of which is
    # wanted here. The temporary directory is removed once the HTML is read.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "net.html")
        net.write_html(html_path)

        with open(html_path, "r", encoding="utf-8") as f:
            html_content = f.read()

    return html_content
78
+
79
def search_keyword(keyword):
    """Return up to 100 saved comments whose text contains ``keyword``.

    The comments are read from ``latest_data.csv`` in the working directory.
    Matching is a case-insensitive literal substring test, so characters
    such as ``(`` or ``*`` in the keyword are searched for verbatim.

    Args:
        keyword: Text to look for. ``None`` is treated as the empty string,
            which matches every comment.

    Returns:
        A DataFrame with the columns ``date``, ``author``, ``like_count`` and
        ``text``. It is empty when no data file exists or the file lacks the
        ``text`` or ``timestamp`` column; ``like_count`` is filled with
        missing values when the file has no such column.
    """
    result_columns = ['date', 'author', 'like_count', 'text']

    if not os.path.exists("latest_data.csv"):
        return pd.DataFrame(columns=result_columns)

    df = pd.read_csv("latest_data.csv")

    if 'text' not in df.columns or 'timestamp' not in df.columns:
        return pd.DataFrame(columns=result_columns)

    needle = "" if keyword is None else str(keyword)
    # fillna first so missing text is searched as "" rather than "nan";
    # regex=False keeps user input from being interpreted as a pattern.
    haystack = df['text'].fillna("").astype(str)
    mask = haystack.str.contains(needle, case=False, na=False, regex=False)

    # Copy so that adding the 'date' column does not write into a view of df.
    result = df.loc[mask].head(100).copy()
    result['date'] = pd.to_datetime(result['timestamp'], unit='s')

    # reindex supplies any absent optional column (e.g. like_count) as NaN.
    return result.reindex(columns=result_columns)
92
 
93
# Gradio interface: upload -> status, three plots and the thread network,
# plus a keyword search over the most recently processed upload.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Comment Thread Analyzer with Timeline + Search")
    file_input = gr.File(label="📁 Upload CSV or XLSX", file_types=[".csv", ".xls", ".xlsx"])

    # Outputs filled by process_file, in the order of its return tuple.
    status = gr.Textbox(label="✅ Status")
    plot1 = gr.Plot(label="📈 Messages per Day")
    plot2 = gr.Plot(label="👀 Top 20 Authors")
    timeline = gr.Plot(label="🕒 Comment Timeline")
    network_html = gr.HTML(label="🧡 Thread Network")

    with gr.Row():
        keyword_input = gr.Textbox(label="🔍 Search Keyword in Comments")
        search_button = gr.Button("Search")

    search_results = gr.Dataframe(headers=["date", "author", "like_count", "text"], label="🔍 Search Results")

    # Re-run the analysis whenever the uploaded file changes.
    file_input.change(fn=process_file, inputs=file_input, outputs=[status, plot1, plot2, timeline, network_html])
    search_button.click(fn=search_keyword, inputs=keyword_input, outputs=search_results)

# Only start the web server when executed as a script.
if __name__ == "__main__":
    demo.launch()