File size: 5,859 Bytes
5936ccc
 
 
4beb0c5
d308445
58b3135
 
 
113fac7
b8d5f44
113fac7
b8d5f44
113fac7
b8d5f44
113fac7
b8d5f44
ea55efd
d308445
 
113fac7
b8d5f44
ea55efd
5e80d8d
58b3135
d308445
 
58b3135
 
 
d308445
ea55efd
58b3135
 
 
 
113fac7
ea55efd
 
113fac7
ba82511
8bd3a54
 
113fac7
 
8bd3a54
 
 
 
 
d308445
 
 
 
113fac7
 
8bd3a54
113fac7
d308445
4beb0c5
d308445
 
ea55efd
 
 
d308445
 
 
 
 
58b3135
4beb0c5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113fac7
4beb0c5
 
 
 
113fac7
 
4beb0c5
d308445
 
 
 
113fac7
d308445
 
 
ea55efd
d308445
 
 
 
ea55efd
d308445
 
ea55efd
 
c442b00
ea55efd
58b3135
 
d308445
ea55efd
 
d308445
 
 
 
 
 
113fac7
d308445
 
 
 
113fac7
ea55efd
d308445
113fac7
 
 
 
 
d308445
58b3135
 
d308445
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import os

def process_file(file):
    """Load an uploaded CSV/XLSX of YouTube comments and build dashboard outputs.

    Parameters
    ----------
    file : uploaded-file object exposing a ``.name`` filesystem path
        (as provided by ``gr.File``).

    Returns
    -------
    tuple
        ``(status_message, messages_per_day_fig, top_authors_fig,
        timeline_fig, network_html, network_html_path)``.
        On any validation failure the status message explains the problem
        and the remaining five slots are ``None``.
    """
    file_ext = os.path.splitext(file.name)[1].lower()
    if file_ext == '.csv':
        df = pd.read_csv(file.name)
    elif file_ext in ['.xls', '.xlsx']:
        df = pd.read_excel(file.name)
    else:
        return "Unsupported file format", None, None, None, None, None

    required_cols = ['comment_id', 'text', 'like_count', 'author_name', 'author_channel_id', 'published_at', 'parent_id']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        return f"Missing columns: {', '.join(missing)}", None, None, None, None, None

    # Silently drop rows whose timestamp cannot be parsed instead of failing.
    df['date'] = pd.to_datetime(df['published_at'], errors='coerce')
    df = df.dropna(subset=['date'])
    if df.empty:
        return "No rows with a valid published_at timestamp", None, None, None, None, None
    df['date_only'] = df['date'].dt.date

    # Normalize like_count in the DataFrame itself (not a detached Series) so
    # the scatter y-axis, its hover data, and the CSV saved for keyword search
    # all agree; non-numeric or missing likes become 0.
    df['like_count'] = pd.to_numeric(df['like_count'], errors='coerce').fillna(0)

    # Messages per Day
    messages_per_day = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

    # Top Authors
    top_authors = df['author_name'].value_counts().nlargest(20).reset_index()
    top_authors.columns = ['author', 'count']
    fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # Timeline of comments. y must be a column NAME for the `labels` mapping
    # to apply — plotly express ignores `labels` entries for a passed Series.
    fig3 = px.scatter(
        df,
        x='date',
        y='like_count',
        hover_data=['author_name', 'text', 'like_count'],
        title="Comments Over Time (Likes)",
        labels={'like_count': 'Like Count', 'date': 'Date'}
    )
    fig3.update_traces(marker=dict(size=6, opacity=0.7))
    fig3.update_layout(yaxis=dict(title='Like Count'))

    # Persist the cleaned data for search_keyword, which reads this path.
    df.to_csv("latest_data.csv", index=False)

    # Build network HTML and save permanently
    network_html_content, network_path = build_network_html_plotly(df)

    return "Success", fig1, fig2, fig3, network_html_content, network_path

def build_network_html_plotly(df):
    """Render the comment/reply graph as a standalone Plotly HTML page.

    Each comment id becomes a node labelled with its author; edges run from
    the author name to the comment, and from a parent comment to its reply
    when ``parent_id`` is present.

    Returns a ``(html_string, html_file_path)`` pair; the page is written
    to ``network.html`` in the working directory.
    """
    graph = nx.DiGraph()
    for _, row in df.iterrows():
        cid = str(row['comment_id'])
        graph.add_node(cid, label=str(row['author_name']))
        graph.add_edge(str(row['author_name']), cid)
        parent = row['parent_id']
        if pd.notna(parent):
            graph.add_edge(str(parent), cid)

    # Fixed seed keeps the layout stable across runs on the same data.
    layout = nx.spring_layout(graph, seed=42)

    edge_xs, edge_ys = [], []
    for src, dst in graph.edges():
        sx, sy = layout[src]
        tx, ty = layout[dst]
        # The trailing None breaks the polyline between consecutive edges.
        edge_xs.extend((sx, tx, None))
        edge_ys.extend((sy, ty, None))

    edge_trace = go.Scatter(
        x=edge_xs, y=edge_ys,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_xs = [layout[node][0] for node in graph.nodes()]
    node_ys = [layout[node][1] for node in graph.nodes()]
    # Author nodes carry no 'label' attribute; fall back to the node id.
    node_labels = [graph.nodes[node].get('label', node) for node in graph.nodes()]

    node_trace = go.Scatter(
        x=node_xs, y=node_ys,
        mode='markers+text',
        text=node_labels,
        textposition="top center",
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color='LightSkyBlue',
            size=10,
            line_width=2))

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title='Comment Thread Network',
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=40),
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )

    # Write the page, then read it back so callers get both string and path.
    html_path = "network.html"
    fig.write_html(html_path)

    with open(html_path, "r", encoding="utf-8") as f:
        html_content = f.read()

    return html_content, html_path

def search_keyword(keyword):
    """Return up to 100 comments whose text contains *keyword*, case-insensitively.

    Reads ``latest_data.csv`` (persisted by ``process_file``). The keyword is
    matched literally — regex metacharacters typed by the user are NOT
    interpreted. Rows without a parseable ``published_at`` are dropped.

    Returns an empty DataFrame with the result columns when no data has been
    uploaded yet or the stored file lacks a ``text`` column.
    """
    empty = pd.DataFrame(columns=['date', 'author_name', 'like_count', 'text'])
    if not os.path.exists("latest_data.csv"):
        return empty

    df = pd.read_csv("latest_data.csv")

    if 'text' not in df.columns:
        return empty

    # regex=False: user input like "(" or "c++" must not raise re.error.
    mask = df['text'].astype(str).str.contains(keyword, case=False, na=False, regex=False)
    # .copy() so the date assignment below writes to our own frame, not a
    # view of df (avoids SettingWithCopyWarning / silently lost writes).
    result = df.loc[mask, ['published_at', 'author_name', 'like_count', 'text']].copy()
    result['date'] = pd.to_datetime(result['published_at'], errors='coerce')
    result = result.dropna(subset=['date'])
    return result[['date', 'author_name', 'like_count', 'text']].head(100)

# Gradio interface
with gr.Blocks() as demo:
    # NOTE(review): component creation order determines the on-page layout,
    # so the statements below must stay in this order.
    gr.Markdown("## 🧠 YouTube Comment Thread Analyzer with Timeline + Search")
    file_input = gr.File(label="πŸ“ Upload CSV or XLSX (YouTube API v3 format)")

    # Output components, in the same order as process_file's return tuple.
    status = gr.Textbox(label="βœ… Status")
    plot1 = gr.Plot(label="πŸ“ˆ Messages per Day")
    plot2 = gr.Plot(label="πŸ‘€ Top 20 Authors")
    timeline = gr.Plot(label="πŸ•’ Comment Timeline")
    network_html = gr.HTML(label="🧡 Thread Network")
    download_network = gr.File(label="⬇️ Download Network HTML", interactive=False)

    with gr.Row():
        keyword_input = gr.Textbox(label="πŸ” Search Keyword in Comments")
        search_button = gr.Button("Search")

    search_results = gr.Dataframe(headers=["date", "author_name", "like_count", "text"], label="πŸ” Search Results")

    # Re-run the full analysis whenever a new file is uploaded.
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=[status, plot1, plot2, timeline, network_html, download_network]
    )
    # Search reads latest_data.csv written by process_file on upload.
    search_button.click(fn=search_keyword, inputs=keyword_input, outputs=search_results)

if __name__ == "__main__":
    demo.launch()