Spaces:

clementBE
/

YT_Comments_explorer

Sleeping

App Files Files Community

clementBE committed on Jul 11, 2025

Commit

d308445

verified ·

1 Parent(s): 70a4e98

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -25

app.py CHANGED Viewed

@@ -1,55 +1,114 @@
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import os
 def process_file(file):
     """Load an uploaded CSV/Excel file and build the two summary charts.

     Args:
         file: Upload object exposing the file's path on disk as ``.name``,
             or ``None`` when nothing was uploaded.

     Returns:
         A ``(status, fig1, fig2)`` tuple. On success ``status`` is
         ``"Success"``, ``fig1`` is the messages-per-day line chart and
         ``fig2`` the top-20-authors bar chart. On a validation failure
         ``status`` describes the problem and both figures are ``None``.

     Side effects:
         Deletes the uploaded file from disk after a successful run.
     """
     if file is None:
         # Without this guard `file.name` raises AttributeError when the
         # handler is invoked with no upload.
         return "No file uploaded", None, None

     # Pick the pandas reader from the file extension.
     file_ext = os.path.splitext(file.name)[1].lower()
     if file_ext == '.csv':
         df = pd.read_csv(file.name)
     elif file_ext in ['.xls', '.xlsx']:
         df = pd.read_excel(file.name)
     else:
         return "Unsupported file format", None, None

     # Both charts depend on these two columns.
     if 'timestamp' not in df.columns or 'author' not in df.columns:
         return "The file must contain 'timestamp' and 'author' columns.", None, None

     # 'timestamp' is interpreted as Unix epoch seconds.
     df['date'] = pd.to_datetime(df['timestamp'], unit='s')

     # --- Plot 1: messages per day ---
     df['date_only'] = df['date'].dt.date
     messages_per_day = df.groupby("date_only").size().reset_index(name="count")
     fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

     # --- Plot 2: messages by author (20 most active) ---
     top_authors = df['author'].value_counts().nlargest(20).reset_index()
     top_authors.columns = ['author', 'count']
     fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
     fig2.update_layout(xaxis_tickangle=-45)

     # The upload is no longer needed once the figures exist.
     os.remove(file.name)
     return "Success", fig1, fig2
 # Gradio interface
# Input widget: a single upload restricted to the supported extensions.
upload_input = gr.File(
    label="Upload CSV or XLSX",
    file_types=[".csv", ".xls", ".xlsx"],
)

# Output widgets, in the order process_file returns its values.
result_outputs = [
    gr.Textbox(label="Status"),
    gr.Plot(label="Messages per Day"),
    gr.Plot(label="Top Authors"),
]

interface = gr.Interface(
    fn=process_file,
    inputs=upload_input,
    outputs=result_outputs,
    title="Message Analyzer",
    description="Upload a CSV or XLSX file with 'timestamp' (Unix) and 'author' columns.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()

 import gradio as gr
 import pandas as pd
 import plotly.express as px
+import networkx as nx
+from pyvis.network import Network
+import tempfile
 import os
 def process_file(file):
     """Analyse an uploaded comment export and build every dashboard output.

     Args:
         file: Upload object exposing the file's path on disk as ``.name``,
             or ``None`` when no file is present.

     Returns:
         A 5-tuple ``(status, fig1, fig2, fig3, network_html)``. On success
         ``status`` is ``"Success"``, the figures are the messages-per-day
         line chart, the top-20-authors bar chart and the comment timeline,
         and ``network_html`` is the value returned by
         ``build_network_html``. On a validation failure ``status``
         describes the problem and the other four items are ``None``.

     Side effects:
         On success, writes the loaded data (plus the derived ``date`` and
         ``date_only`` columns) to ``latest_data.csv`` in the current
         working directory for the keyword search, and deletes the uploaded
         file.
     """
     if file is None:
         # Without this guard `file.name` raises AttributeError when the
         # handler runs with no upload (e.g. the widget was cleared).
         return "No file uploaded", None, None, None, None

     # Pick the pandas reader from the file extension.
     file_ext = os.path.splitext(file.name)[1].lower()
     if file_ext == '.csv':
         df = pd.read_csv(file.name)
     elif file_ext in ['.xls', '.xlsx']:
         df = pd.read_excel(file.name)
     else:
         return "Unsupported file format", None, None, None, None

     required_cols = ['timestamp', 'author', 'text', 'id', 'parent']
     missing = [col for col in required_cols if col not in df.columns]
     if missing:
         return f"Missing columns: {', '.join(missing)}", None, None, None, None

     # 'timestamp' is interpreted as Unix epoch seconds; report bad values
     # through the status box like every other validation failure.
     try:
         df['date'] = pd.to_datetime(df['timestamp'], unit='s')
     except (ValueError, TypeError, OverflowError):
         return ("Column 'timestamp' must contain Unix timestamps in seconds.",
                 None, None, None, None)
     df['date_only'] = df['date'].dt.date

     # Messages per Day
     messages_per_day = df.groupby("date_only").size().reset_index(name="count")
     fig1 = px.line(messages_per_day, x="date_only", y="count", title="Messages per Day")

     # Top Authors
     top_authors = df['author'].value_counts().nlargest(20).reset_index()
     top_authors.columns = ['author', 'count']
     fig2 = px.bar(top_authors, x='author', y='count', title="Top 20 Authors", text='count')
     fig2.update_layout(xaxis_tickangle=-45)

     # Timeline of Comments: every comment is a dot on a flat line.
     # 'like_count' is not a required column, so only request hover fields
     # that actually exist in the upload.
     hover_cols = [col for col in ('author', 'text', 'like_count') if col in df.columns]
     fig3 = px.scatter(df, x='date', y=[0] * len(df),
                       hover_data=hover_cols,
                       title="Comments Over Time", labels={'y': ''})
     fig3.update_traces(marker=dict(size=6, opacity=0.5))
     fig3.update_layout(yaxis=dict(showticklabels=False))

     # Save to CSV for keyword search.
     # NOTE(review): a fixed filename is shared by every session of the app;
     # concurrent users would overwrite each other's data -- confirm intent.
     df.to_csv("latest_data.csv", index=False)

     os.remove(file.name)
     return "Success", fig1, fig2, fig3, build_network_html(df)
def build_network_html(df):
    """Render the author/comment/reply structure of *df* as an HTML document.

    Args:
        df: DataFrame with ``author``, ``id`` and ``parent`` columns, one
            row per comment. ``parent`` is treated as missing when
            ``pd.notna`` is false for it.

    Returns:
        The HTML text pyvis generates for the network, as a string.

    The graph has one node per comment id (labelled with the author's name)
    and an edge ``author -> comment``. When a parent is present it also has
    an edge ``parent -> comment``.
    """
    G = nx.DiGraph()
    # Walk the three columns in lockstep; this avoids building a Series
    # object for every row the way iterrows() does.
    for author, comment_id, parent_id in zip(df['author'], df['id'], df['parent']):
        author = str(author)
        comment_id = str(comment_id)
        # Add node for comment
        G.add_node(comment_id, label=author)
        # Add edge from author to comment
        G.add_edge(author, comment_id)
        # Add edge from parent to this comment (thread link)
        if pd.notna(parent_id):
            # NOTE(review): if the id columns are numeric, a parent read as
            # a float (e.g. 12.0) will not match the comment id "12" --
            # confirm ids are always strings in the uploaded data.
            G.add_edge(str(parent_id), comment_id)

    net = Network(height="400px", width="100%", notebook=False, directed=True)
    net.from_nx(G)

    # write_html only writes the file. show() is the notebook/browser entry
    # point and is not suitable for a headless server. The temporary
    # directory is removed as soon as the HTML has been read back.
    with tempfile.TemporaryDirectory() as tmp_dir:
        html_path = os.path.join(tmp_dir, "net.html")
        net.write_html(html_path)
        with open(html_path, "r", encoding="utf-8") as f:
            html_content = f.read()

    # NOTE(review): the generated page relies on <script> tags; confirm the
    # gr.HTML component executes them (an <iframe srcdoc=...> wrapper is the
    # usual workaround if it does not).
    return html_content
def search_keyword(keyword):
    """Return up to 100 saved comments whose text contains *keyword*.

    The search reads ``latest_data.csv`` from the current working
    directory. Matching is a case-insensitive literal substring test, so
    characters such as ``(`` or ``*`` in the keyword are matched as typed
    rather than interpreted as a regular expression.

    Args:
        keyword: Text to look for. ``None`` is treated like the empty
            string, which matches every comment.

    Returns:
        A DataFrame with the columns ``date``, ``author``, ``like_count``
        and ``text`` (in that order), holding at most the first 100
        matches. The frame is empty when the data file does not exist or
        has no ``text`` column. Columns absent from the saved data are
        filled with missing values.
    """
    output_cols = ['date', 'author', 'like_count', 'text']

    if not os.path.exists("latest_data.csv"):
        return pd.DataFrame(columns=output_cols)
    df = pd.read_csv("latest_data.csv")
    if 'text' not in df.columns:
        return pd.DataFrame(columns=output_cols)

    needle = "" if keyword is None else str(keyword)
    # fillna first so missing comments become "" instead of the string
    # "nan", which would otherwise match searches such as "na".
    text = df['text'].fillna("").astype(str)
    mask = text.str.contains(needle, case=False, na=False, regex=False)

    # reindex returns a new frame (no write to a slice of df) and creates
    # any missing column, e.g. the optional 'like_count', as all-NaN.
    result = df.loc[mask].head(100).reindex(
        columns=['timestamp', 'author', 'like_count', 'text'])
    result.insert(0, 'date', pd.to_datetime(result.pop('timestamp'), unit='s'))
    return result
 # Gradio interface
# Components appear on the page in the order they are created below.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Comment Thread Analyzer with Timeline + Search")

    # Upload widget followed by the outputs of the analysis.
    file_input = gr.File(
        label="📁 Upload CSV or XLSX",
        file_types=[".csv", ".xls", ".xlsx"],
    )
    status = gr.Textbox(label="✅ Status")
    plot1 = gr.Plot(label="📈 Messages per Day")
    plot2 = gr.Plot(label="👤 Top 20 Authors")
    timeline = gr.Plot(label="🕒 Comment Timeline")
    network_html = gr.HTML(label="🧵 Thread Network")

    # Keyword search: text box and button side by side, results underneath.
    with gr.Row():
        keyword_input = gr.Textbox(label="🔍 Search Keyword in Comments")
        search_button = gr.Button("Search")
    search_results = gr.Dataframe(
        headers=["date", "author", "like_count", "text"],
        label="🔍 Search Results",
    )

    # Wiring: the output list matches the order of process_file's return tuple.
    analysis_outputs = [status, plot1, plot2, timeline, network_html]
    file_input.change(
        fn=process_file,
        inputs=file_input,
        outputs=analysis_outputs,
    )
    search_button.click(
        fn=search_keyword,
        inputs=keyword_input,
        outputs=search_results,
    )

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()