import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import os
def process_file(file):
    """Parse an uploaded YouTube-comments export and build the dashboard outputs.

    Accepts a CSV or Excel file (Gradio file object with a ``.name`` path) and
    returns a 6-tuple::

        (status message, messages-per-day figure, top-authors figure,
         timeline figure, network HTML string, network HTML file path)

    On any validation failure the status message describes the problem and the
    remaining five slots are None. As a side effect the parsed data is cached
    to ``latest_data.csv`` for the keyword-search feature.
    """
    extension = os.path.splitext(file.name)[1].lower()
    if extension == '.csv':
        df = pd.read_csv(file.name)
    elif extension in ('.xls', '.xlsx'):
        df = pd.read_excel(file.name)
    else:
        return "Unsupported file format", None, None, None, None, None

    # Validate the YouTube API v3 export schema before touching any column.
    required_cols = ['comment_id', 'text', 'like_count', 'author_name',
                     'author_channel_id', 'published_at', 'parent_id']
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        return f"Missing columns: {', '.join(missing)}", None, None, None, None, None

    # Rows whose timestamp cannot be parsed are dropped rather than guessed.
    df['date'] = pd.to_datetime(df['published_at'], errors='coerce')
    df = df.dropna(subset=['date'])
    df['date_only'] = df['date'].dt.date

    # Figure 1: daily comment volume.
    daily = df.groupby("date_only").size().reset_index(name="count")
    fig1 = px.line(daily, x="date_only", y="count", title="Messages per Day")

    # Figure 2: the 20 most active commenters.
    authors = df['author_name'].value_counts().nlargest(20).reset_index()
    authors.columns = ['author', 'count']
    fig2 = px.bar(authors, x='author', y='count', title="Top 20 Authors", text='count')
    fig2.update_layout(xaxis_tickangle=-45)

    # Figure 3: every comment plotted over time, likes on the y axis
    # (missing like counts shown as 0).
    likes = df['like_count'].fillna(0)
    fig3 = px.scatter(
        df,
        x='date',
        y=likes,
        hover_data=['author_name', 'text', 'like_count'],
        title="Comments Over Time (Likes)",
        labels={'like_count': 'Like Count', 'date': 'Date'},
    )
    fig3.update_traces(marker=dict(size=6, opacity=0.7))
    fig3.update_layout(yaxis=dict(title='Like Count'))

    # Cache the parsed frame so search_keyword() can reuse it later.
    df.to_csv("latest_data.csv", index=False)

    # Build the reply-thread network and persist it as a standalone HTML file.
    html_content, html_path = build_network_html_plotly(df)
    return "Success", fig1, fig2, fig3, html_content, html_path
def build_network_html_plotly(df):
    """Render the comment/reply graph with Plotly and save it to network.html.

    Each comment id becomes a node labelled with its author; edges run from an
    author node to each of their comments and from a parent comment to its
    replies (when ``parent_id`` is present). Returns ``(html string, file path)``.
    """
    graph = nx.DiGraph()
    for _, row in df.iterrows():
        who = str(row['author_name'])
        cid = str(row['comment_id'])
        graph.add_node(cid, label=who)
        graph.add_edge(who, cid)
        if pd.notna(row['parent_id']):
            # Reply edge: parent comment -> this comment.
            graph.add_edge(str(row['parent_id']), cid)

    # Fixed seed keeps the layout stable across runs of the same data.
    layout = nx.spring_layout(graph, seed=42)

    # Edge coordinates, with None separators so Plotly draws disjoint segments.
    edge_x, edge_y = [], []
    for src, dst in graph.edges():
        x0, y0 = layout[src]
        x1, y1 = layout[dst]
        edge_x.extend((x0, x1, None))
        edge_y.extend((y0, y1, None))
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    # Node coordinates and labels; author nodes have no 'label' attribute and
    # fall back to their own id.
    node_x = [layout[n][0] for n in graph.nodes()]
    node_y = [layout[n][1] for n in graph.nodes()]
    labels = [graph.nodes[n].get('label', n) for n in graph.nodes()]
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=labels,
        textposition="top center",
        hoverinfo='text',
        marker=dict(
            showscale=False,
            color='LightSkyBlue',
            size=10,
            line_width=2))

    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title='Comment Thread Network',
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)))

    # Write a self-contained HTML file and return both its text and its path.
    html_path = "network.html"
    fig.write_html(html_path)
    with open(html_path, "r", encoding="utf-8") as f:
        return f.read(), html_path
def search_keyword(keyword):
    """Search the most recently uploaded comments for a literal keyword.

    Reads the ``latest_data.csv`` cache written by process_file and returns up
    to 100 matching rows as a DataFrame with columns
    ``[date, author_name, like_count, text]``. The match is case-insensitive
    and literal (not a regex). Returns an empty frame when no data has been
    uploaded yet, the cache lacks a ``text`` column, or the keyword is blank.
    """
    empty = pd.DataFrame(columns=['date', 'author_name', 'like_count', 'text'])
    # A blank pattern would match every row; treat it as "no search".
    if not keyword or not keyword.strip():
        return empty
    if not os.path.exists("latest_data.csv"):
        return empty
    df = pd.read_csv("latest_data.csv")
    if 'text' not in df.columns:
        return empty
    # regex=False: match the user's text literally, so characters like '(' or
    # '*' cannot raise a regex compilation error.
    mask = df['text'].astype(str).str.contains(keyword, case=False, na=False, regex=False)
    # .copy() so adding the 'date' column below writes to an independent frame
    # instead of a view of df (avoids chained-assignment problems).
    result = df.loc[mask, ['published_at', 'author_name', 'like_count', 'text']].copy()
    result['date'] = pd.to_datetime(result['published_at'], errors='coerce')
    result = result.dropna(subset=['date'])
    return result[['date', 'author_name', 'like_count', 'text']].head(100)
# ---- Gradio interface ----
# NOTE(review): the emoji label strings below are mojibake in the source file
# (e.g. "π§"); they are kept verbatim. The status label was split across two
# lines by the scrape and has been rejoined with a space — confirm against the
# original file.
with gr.Blocks() as demo:
    gr.Markdown("## π§ YouTube Comment Thread Analyzer with Timeline + Search")

    # Upload control and the six analysis outputs it drives.
    upload_box = gr.File(label="π Upload CSV or XLSX (YouTube API v3 format)")
    status_box = gr.Textbox(label="β Status")
    daily_plot = gr.Plot(label="π Messages per Day")
    authors_plot = gr.Plot(label="π€ Top 20 Authors")
    timeline_plot = gr.Plot(label="π Comment Timeline")
    network_view = gr.HTML(label="π§΅ Thread Network")
    network_file = gr.File(label="β¬οΈ Download Network HTML", interactive=False)

    # Keyword-search row and its results table.
    with gr.Row():
        query_box = gr.Textbox(label="π Search Keyword in Comments")
        query_btn = gr.Button("Search")
    results_table = gr.Dataframe(
        headers=["date", "author_name", "like_count", "text"],
        label="π Search Results")

    # Re-run the full analysis whenever a new file is uploaded.
    upload_box.change(
        fn=process_file,
        inputs=upload_box,
        outputs=[status_box, daily_plot, authors_plot,
                 timeline_plot, network_view, network_file],
    )
    query_btn.click(fn=search_keyword, inputs=query_box, outputs=results_table)

if __name__ == "__main__":
    demo.launch()
|