import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import os
def process_file(file):
    """Parse an uploaded YouTube-comments export and build the dashboard outputs.

    Accepts a CSV or Excel file (Gradio file object with a ``.name`` path) and
    returns a 6-tuple::

        (status message, messages-per-day figure, top-authors figure,
         timeline figure, network HTML string, network HTML file path)

    On any validation failure the status message describes the problem and the
    remaining five slots are None. As a side effect the parsed data is cached
    to ``latest_data.csv`` for the keyword-search feature.
    """
    extension = os.path.splitext(file.name)[1].lower()
    if extension == '.csv':
        df = pd.read_csv(file.name)
    elif extension in ('.xls', '.xlsx'):
        df = pd.read_excel(file.name)
    else:
        return "Unsupported file format", None, None, None, None, None

    # Validate the YouTube API v3 export schema before touching any column.
    required_cols = ['comment_id', 'text', 'like_count', 'author_name',
                     'author_channel_id', 'published_at', 'parent_id']
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        return f"Missing columns: {', '.join(missing)}", None, None, None, None, None

    # Rows whose timestamp cannot be parsed are dropped rather than guessed.
    df['date'] = pd.to_datetime(df['published_at'], errors='coerce')
    df = df.dropna(subset=['date'])
    df['date_only'] = df['date'].dt.date

    # Figure 1: daily comment volume.
    daily = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(daily, x="date_only", y="count", title="Messages per Day")

    # Figure 2: the 20 most active commenters.
    authors = df['author_name'].value_counts().nlargest(20).reset_index()
    authors.columns = ['author', 'count']
    fig2 = px.bar(authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # Figure 3: every comment plotted over time, likes on the y axis
    # (missing like counts shown as 0).
    likes = df['like_count'].fillna(0)
    fig3 = px.scatter(
        df,
        x='date',
        y=likes,
        hover_data=['author_name', 'text', 'like_count'],
        title="Comments Over Time (Likes)",
        labels={'like_count': 'Like Count', 'date': 'Date'},
    )
    fig3.update_traces(marker=dict(size=6, opacity=0.7))
    fig3.update_layout(yaxis=dict(title='Like Count'))

    # Cache the parsed frame so search_keyword() can reuse it later.
    df.to_csv("latest_data.csv", index=False)

    # Build the reply-thread network and persist it as a standalone HTML file.
    html_content, html_path = build_network_html_plotly(df)
    return "Success", fig1, fig2, fig3, html_content, html_path
def build_network_html_plotly(df):
    """Render the comment/reply graph with Plotly and save it to network.html.

    Each comment id becomes a node labelled with its author; edges run from an
    author node to each of their comments and from a parent comment to its
    replies (when ``parent_id`` is present). Returns ``(html string, file path)``.
    """
    graph = nx.DiGraph()
    for _, row in df.iterrows():
        who = str(row['author_name'])
        cid = str(row['comment_id'])
        graph.add_node(cid, label=who)
        graph.add_edge(who, cid)
        if pd.notna(row['parent_id']):
            # Reply edge: parent comment -> this comment.
            graph.add_edge(str(row['parent_id']), cid)

    # Fixed seed keeps the layout stable across runs of the same data.
    layout = nx.spring_layout(graph, seed=42)

    # Edge coordinates, with None separators so Plotly draws disjoint segments.
    edge_x, edge_y = [], []
    for src, dst in graph.edges():
        x0, y0 = layout[src]
        x1, y1 = layout[dst]
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Node coordinates and labels; author nodes have no 'label' attribute and
    # fall back to their own id.
    node_x = [layout[n][0] for n in graph.nodes()]
    node_y = [layout[n][1] for n in graph.nodes()]
    labels = [graph.nodes[n].get('label', n) for n in graph.nodes()]
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=labels,
        textposition="top center",
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color='LightSkyBlue',
            size=10,
            line_width=2))

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title='Comment Thread Network',
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    # Write a self-contained HTML file and return both its text and its path.
    html_path = "network.html"
    fig.write_html(html_path)
    with open(html_path, "r", encoding="utf-8") as f:
        return f.read(), html_path
def search_keyword(keyword):
    """Search the most recently uploaded comments for a literal keyword.

    Reads the ``latest_data.csv`` cache written by process_file and returns up
    to 100 matching rows as a DataFrame with columns
    ``[date, author_name, like_count, text]``. The match is case-insensitive
    and literal (not a regex). Returns an empty frame when no data has been
    uploaded yet, the cache lacks a ``text`` column, or the keyword is blank.
    """
    empty = pd.DataFrame(columns=['date', 'author_name', 'like_count', 'text'])
    # A blank pattern would match every row; treat it as "no search".
    if not keyword or not keyword.strip():
        return empty
    if not os.path.exists("latest_data.csv"):
        return empty
    df = pd.read_csv("latest_data.csv")
    if 'text' not in df.columns:
        return empty
    # regex=False: match the user's text literally, so characters like '(' or
    # '*' cannot raise a regex compilation error.
    mask = df['text'].astype(str).str.contains(keyword, case=False, na=False, regex=False)
    # .copy() so adding the 'date' column below writes to an independent frame
    # instead of a view of df (avoids chained-assignment problems).
    result = df.loc[mask, ['published_at', 'author_name', 'like_count', 'text']].copy()
    result['date'] = pd.to_datetime(result['published_at'], errors='coerce')
    result = result.dropna(subset=['date'])
    return result[['date', 'author_name', 'like_count', 'text']].head(100)
# ---- Gradio interface ----
# NOTE(review): the emoji label strings below are mojibake in the source file
# (e.g. "π§"); they are kept verbatim. The status label was split across two
# lines by the scrape and has been rejoined with a space — confirm against the
# original file.
with gr.Blocks() as demo:
    gr.Markdown("## π§ YouTube Comment Thread Analyzer with Timeline + Search")

    # Upload control and the six analysis outputs it drives.
    upload_box = gr.File(label="π Upload CSV or XLSX (YouTube API v3 format)")
    status_box = gr.Textbox(label="β Status")
    daily_plot = gr.Plot(label="π Messages per Day")
    authors_plot = gr.Plot(label="π€ Top 20 Authors")
    timeline_plot = gr.Plot(label="π Comment Timeline")
    network_view = gr.HTML(label="π§΅ Thread Network")
    network_file = gr.File(label="β¬οΈ Download Network HTML", interactive=False)

    # Keyword-search row and its results table.
    with gr.Row():
        query_box = gr.Textbox(label="π Search Keyword in Comments")
        query_btn = gr.Button("Search")
    results_table = gr.Dataframe(
        headers=["date", "author_name", "like_count", "text"],
        label="π Search Results")

    # Re-run the full analysis whenever a new file is uploaded.
    upload_box.change(
        fn=process_file,
        inputs=upload_box,
        outputs=[status_box, daily_plot, authors_plot,
                 timeline_plot, network_view, network_file],
    )
    query_btn.click(fn=search_keyword, inputs=query_box, outputs=results_table)

if __name__ == "__main__":
    demo.launch()
|