clementBE committed on
Commit
ea55efd
·
verified ·
1 Parent(s): 113fac7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -22
app.py CHANGED
@@ -14,12 +14,12 @@ def process_file(file):
14
  else:
15
  return "Unsupported file format", None, None, None, None, None
16
 
17
- required_cols = ['timestamp', 'author', 'text', 'id', 'parent']
18
  missing = [col for col in required_cols if col not in df.columns]
19
  if missing:
20
  return f"Missing columns: {', '.join(missing)}", None, None, None, None, None
21
 
22
- df['date'] = pd.to_datetime(df['timestamp'], unit='s', errors='coerce')
23
  df = df.dropna(subset=['date'])
24
  df['date_only'] = df['date'].dt.date
25
 
@@ -28,16 +28,14 @@ def process_file(file):
28
  fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")
29
 
30
  # Top Authors
31
- top_authors = df['author'].value_counts().nlargest(20).reset_index()
32
  top_authors.columns = ['author', 'count']
33
  fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
34
  fig2.update_layout(xaxis_tickangle=-45)
35
 
36
  # Timeline of Comments (like_count if exists else 0)
37
- y_data = df['like_count'] if 'like_count' in df.columns else [0]*len(df)
38
- hover_cols = ['author', 'text']
39
- if 'like_count' in df.columns:
40
- hover_cols.append('like_count')
41
 
42
  fig3 = px.scatter(
43
  df,
@@ -61,9 +59,9 @@ def process_file(file):
61
  def build_network_html_plotly(df):
62
  G = nx.DiGraph()
63
  for _, row in df.iterrows():
64
- author = str(row['author'])
65
- comment_id = str(row['id'])
66
- parent_id = row['parent']
67
 
68
  G.add_node(comment_id, label=author)
69
  G.add_edge(author, comment_id)
@@ -128,27 +126,23 @@ def build_network_html_plotly(df):
128
 
129
  def search_keyword(keyword):
130
  if not os.path.exists("latest_data.csv"):
131
- return pd.DataFrame(columns=['date', 'author', 'like_count', 'text'])
132
 
133
  df = pd.read_csv("latest_data.csv")
134
 
135
  if 'text' not in df.columns:
136
- return pd.DataFrame(columns=['date', 'author', 'like_count', 'text'])
137
 
138
  mask = df['text'].astype(str).str.contains(keyword, case=False, na=False)
139
- result = df.loc[mask, ['timestamp', 'author', 'text']]
140
- if 'like_count' in df.columns:
141
- result['like_count'] = df.loc[mask, 'like_count']
142
- else:
143
- result['like_count'] = None
144
- result['date'] = pd.to_datetime(result['timestamp'], unit='s', errors='coerce')
145
  result = result.dropna(subset=['date'])
146
- return result[['date', 'author', 'like_count', 'text']].head(100)
147
 
148
  # Gradio interface
149
  with gr.Blocks() as demo:
150
- gr.Markdown("## 🧠 Comment Thread Analyzer with Timeline + Search")
151
- file_input = gr.File(label="📁 Upload CSV or XLSX", file_types=[".csv", ".xls", ".xlsx"])
152
 
153
  status = gr.Textbox(label="✅ Status")
154
  plot1 = gr.Plot(label="📈 Messages per Day")
@@ -161,7 +155,7 @@ with gr.Blocks() as demo:
161
  keyword_input = gr.Textbox(label="🔍 Search Keyword in Comments")
162
  search_button = gr.Button("Search")
163
 
164
- search_results = gr.Dataframe(headers=["date", "author", "like_count", "text"], label="🔍 Search Results")
165
 
166
  file_input.change(
167
  fn=process_file,
 
14
  else:
15
  return "Unsupported file format", None, None, None, None, None
16
 
17
+ required_cols = ['comment_id', 'text', 'like_count', 'author_name', 'author_channel_id', 'published_at', 'parent_id']
18
  missing = [col for col in required_cols if col not in df.columns]
19
  if missing:
20
  return f"Missing columns: {', '.join(missing)}", None, None, None, None, None
21
 
22
+ df['date'] = pd.to_datetime(df['published_at'], errors='coerce')
23
  df = df.dropna(subset=['date'])
24
  df['date_only'] = df['date'].dt.date
25
 
 
28
  fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")
29
 
30
  # Top Authors
31
+ top_authors = df['author_name'].value_counts().nlargest(20).reset_index()
32
  top_authors.columns = ['author', 'count']
33
  fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
34
  fig2.update_layout(xaxis_tickangle=-45)
35
 
36
  # Timeline of Comments (like_count if exists else 0)
37
+ y_data = df['like_count'].fillna(0)
38
+ hover_cols = ['author_name', 'text', 'like_count']
 
 
39
 
40
  fig3 = px.scatter(
41
  df,
 
59
  def build_network_html_plotly(df):
60
  G = nx.DiGraph()
61
  for _, row in df.iterrows():
62
+ author = str(row['author_name'])
63
+ comment_id = str(row['comment_id'])
64
+ parent_id = row['parent_id']
65
 
66
  G.add_node(comment_id, label=author)
67
  G.add_edge(author, comment_id)
 
126
 
127
  def search_keyword(keyword):
128
  if not os.path.exists("latest_data.csv"):
129
+ return pd.DataFrame(columns=['date', 'author_name', 'like_count', 'text'])
130
 
131
  df = pd.read_csv("latest_data.csv")
132
 
133
  if 'text' not in df.columns:
134
+ return pd.DataFrame(columns=['date', 'author_name', 'like_count', 'text'])
135
 
136
  mask = df['text'].astype(str).str.contains(keyword, case=False, na=False)
137
+ result = df.loc[mask, ['published_at', 'author_name', 'like_count', 'text']]
138
+ result['date'] = pd.to_datetime(result['published_at'], errors='coerce')
 
 
 
 
139
  result = result.dropna(subset=['date'])
140
+ return result[['date', 'author_name', 'like_count', 'text']].head(100)
141
 
142
  # Gradio interface
143
  with gr.Blocks() as demo:
144
+ gr.Markdown("## 🧠 YouTube Comment Thread Analyzer with Timeline + Search")
145
+ file_input = gr.File(label="📁 Upload CSV or XLSX (YouTube API v3 format)")
146
 
147
  status = gr.Textbox(label="✅ Status")
148
  plot1 = gr.Plot(label="📈 Messages per Day")
 
155
  keyword_input = gr.Textbox(label="🔍 Search Keyword in Comments")
156
  search_button = gr.Button("Search")
157
 
158
+ search_results = gr.Dataframe(headers=["date", "author_name", "like_count", "text"], label="🔍 Search Results")
159
 
160
  file_input.change(
161
  fn=process_file,