Spaces:

blazingbunny
/

cluster-vis-marimo

Runtime error

App Files Files Community

blazingbunny committed on Aug 8, 2025

Commit

40e2e7e

verified ·

1 Parent(s): 2dff648

Create script.py

Browse files

Files changed (1) hide show

script.py +168 -0

script.py ADDED Viewed

	@@ -0,0 +1,168 @@

+import sqlite3
+import pandas as pd
+import numpy as np
+import networkx as nx
+from networkx.algorithms import community
+import matplotlib.pyplot as plt
+import random
+import time
+from datetime import datetime
+from googleapiclient.discovery import build
+import langid
+from tld import get_tld
+from fuzzywuzzy import fuzz
+# ---------------------------
+# Utility Functions
+# ---------------------------
def language_detection(text):
    """Return the language label that ``langid`` predicts for ``text``.

    Args:
        text: String to classify.

    Returns:
        The first element of the ``langid.classify`` result.
    """
    prediction = langid.classify(text)
    return prediction[0]
def extract_mainDomain(url):
    """Return the first-level domain (``fld``) of ``url``.

    Args:
        url: URL string handed to ``get_tld``.

    Returns:
        The ``fld`` attribute of the parsed result, or ``""`` when parsing
        raises for any reason (best-effort: callers never see an exception).
    """
    try:
        parsed = get_tld(url, as_object=True)
        domain = parsed.fld
    except Exception:
        domain = ""
    return domain
def fuzzy_ratio(str1, str2):
    """Return ``fuzz.ratio`` of the two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The similarity score computed by ``fuzzywuzzy.fuzz.ratio``.
    """
    score = fuzz.ratio(str1, str2)
    return score
def fuzzy_token_set_ratio(str1, str2):
    """Return ``fuzz.token_set_ratio`` of the two strings.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The similarity score computed by
        ``fuzzywuzzy.fuzz.token_set_ratio``.
    """
    score = fuzz.token_set_ratio(str1, str2)
    return score
+# ---------------------------
+# Google Custom Search
+# ---------------------------
def google_search(query, api_key, cse_id, hl, gl):
    """Run one Custom Search request and return the parsed response.

    Args:
        query: Search terms, sent as ``q``.
        api_key: Developer key used to build the API client.
        cse_id: Custom Search Engine id, sent as ``cx``.
        hl: Value sent as the ``hl`` request parameter.
        gl: Value sent as the ``gl`` request parameter.

    Returns:
        The response of the executed request (restricted to the fields
        listed below, at most 10 items), or ``None`` if anything raised;
        in that case the error is printed.
    """
    response = None
    try:
        client = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        request = client.cse().list(
            q=query,
            hl=hl,
            gl=gl,
            cx=cse_id,
            fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
            num=10,
        )
        response = request.execute()
        # Pause after every successful request (presumably to stay under
        # the API rate limit).
        time.sleep(1)
    except Exception as err:
        print("Search error:", err)
        response = None
    return response
+# ---------------------------
+# Fetch and Store Search Results
+# ---------------------------
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Search every keyword and append the result rows to a SQLite table.

    Each returned item becomes one row holding the request metadata, the
    item's position, link, title and snippet, the detected snippet language
    and fuzzy match scores of title and snippet against the query. All rows
    of one call share the same ``requestTimestamp``.

    Args:
        keywords: Iterable of search queries.
        hl: Value passed to the search request as ``hl`` and stored per row.
        gl: Value passed to the search request as ``gl`` and stored per row.
        api_key: Google API developer key.
        cse_id: Custom Search Engine id.
        database: Path of the SQLite database.
        table: Name of the table the rows are appended to.

    Returns:
        None. Nothing is written when no query produced any item.
    """
    timestamp = datetime.now()
    rows = []
    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        # Skip failed searches (None) and responses that carry no items.
        if not result or not result.get("items"):
            continue
        # Identical for every item of this response, so look it up once.
        total_results = result["queries"]["request"][0]["totalResults"]
        for position, item in enumerate(result["items"], start=1):
            snippet = item.get("snippet", "")
            title = item.get("title", "")
            rows.append({
                "requestTimestamp": timestamp,
                "searchTerms": query,
                "gl": gl,
                "hl": hl,
                "totalResults": total_results,
                "link": item["link"],
                "displayLink": item["displayLink"],
                "main_domain": extract_mainDomain(item["link"]),
                "position": position,
                "snippet": snippet,
                "snipped_language": language_detection(snippet),
                "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                "title": title,
                "title_matchScore_order": fuzzy_ratio(title, query),
                "title_matchScore_token": fuzzy_token_set_ratio(title, query),
            })
    if not rows:
        # An empty frame has no columns to create or append to a table with.
        return
    df = pd.DataFrame(rows)
    conn = sqlite3.connect(database)
    try:
        # ``with conn`` only commits or rolls back; the explicit close in
        # ``finally`` is what releases the connection.
        with conn:
            df.to_sql(
                table,
                con=conn,
                index=False,
                if_exists="append",
                dtype={"requestTimestamp": "DateTime"},
            )
    finally:
        conn.close()
+# ---------------------------
+# Cluster Graphs
+# ---------------------------
def com_postion(n, scale=1, center=(0, 0)):
    """Return ``n`` points spread evenly around a circle.

    Args:
        n: Number of points.
        scale: Radius of the circle.
        center: ``(x, y)`` of the circle's centre.

    Returns:
        Array of shape ``(n, 2)``; row ``i`` is the point at angle
        ``2 * pi * i / n``.
    """
    angles = np.linspace(0, 2 * np.pi, n, endpoint=False)
    unit_circle = np.stack((np.cos(angles), np.sin(angles)), axis=1)
    offset = np.array(center)
    return scale * unit_circle + offset
def node_postion(nodes, scale=1, center=(0, 0)):
    """Map each node to a point on a circle, in the order given.

    Args:
        nodes: Sequence of node keys (must support ``len``).
        scale: Radius of the circle.
        center: ``(x, y)`` of the circle's centre.

    Returns:
        Dict from node to its ``(x, y)`` coordinate array; the nodes are
        spaced evenly around the circle.
    """
    count = len(nodes)
    angles = np.linspace(0, 2 * np.pi, count, endpoint=False)
    unit_circle = np.stack((np.cos(angles), np.sin(angles)), axis=1)
    coords = scale * unit_circle + np.array(center)
    return {node: xy for node, xy in zip(nodes, coords)}
def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster keywords that share result links and draw the cluster graph.

    Loads one snapshot of stored search results, links two search terms
    whenever they returned the same ``link``, detects communities with
    greedy modularity maximisation and plots every community on its own
    circle.

    Args:
        database: Path of the SQLite database.
        serp_table: Name of the table holding the search results. It is
            interpolated into the SQL text (identifiers cannot be bound as
            parameters), so it must come from a trusted source.
        timestamp: ``"max"`` selects the rows with the largest
            ``requestTimestamp``; any other value selects the rows whose
            ``requestTimestamp`` equals ``str(timestamp)``.

    Returns:
        Tuple ``(fig, df_clusters)``: the matplotlib figure and a DataFrame
        with one row per keyword (``searchTerms``, ``cluster``,
        ``requestTimestamp``).
    """
    from itertools import combinations

    if timestamp == "max":
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
        '''
        params = None
    else:
        # Bind the value instead of splicing it into the SQL text.
        query = f'''
            SELECT * FROM {serp_table}
            WHERE requestTimestamp = ?
        '''
        params = (str(timestamp),)
    conn = sqlite3.connect(database)
    try:
        df = pd.read_sql(query, conn, params=params)
    finally:
        conn.close()

    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])
    # Search terms that returned the same link are pairwise connected.
    # Grouping by link avoids re-scanning the whole frame for every row.
    for _, terms in df.groupby("link", sort=False)["searchTerms"]:
        distinct_terms = list(dict.fromkeys(terms))
        G.add_edges_from(combinations(distinct_terms, 2))

    if G.number_of_edges() == 0:
        # Modularity is undefined without edges and some networkx versions
        # raise here; treat every keyword as its own cluster instead.
        communities = [frozenset([node]) for node in G.nodes]
    else:
        communities = community.greedy_modularity_communities(G)

    degrees = dict(G.degree())
    # One random hex colour per community.
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]

    # Community centres sit on a large circle; each community's members sit
    # on a small circle around their centre.
    pos = {}
    centers = com_postion(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_postion(list(group), scale=0.8, center=centers[i]))

    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    # Faint base layer with labels and edges; coloured nodes go on top.
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8, edge_color='gray', alpha=0.2)
    for i, group in enumerate(communities):
        members = list(group)
        nx.draw_networkx_nodes(
            G, pos, nodelist=members, node_color=colors[i],
            node_size=[degrees[n] * 10 for n in members], ax=ax
        )
    ax.axis('off')

    # NOTE(review): requestTimestamp holds the ``timestamp`` argument as
    # passed in, i.e. the literal "max" for the default call - confirm
    # whether the resolved timestamp is wanted here.
    cluster_rows = [
        {"searchTerms": kw, "cluster": i, "requestTimestamp": timestamp}
        for i, group in enumerate(communities)
        for kw in group
    ]
    df_clusters = pd.DataFrame(cluster_rows)
    return fig, df_clusters
+# ---------------------------
+# Compare Clusters
+# ---------------------------
def compare_clusters(df1, df2):
    """Return the keywords whose cluster label differs between two runs.

    Args:
        df1: Cluster assignments with ``searchTerms`` and ``cluster``
            columns.
        df2: Cluster assignments with the same columns.

    Returns:
        The rows of the inner join on ``searchTerms`` where ``cluster_1``
        (from ``df1``) differs from ``cluster_2`` (from ``df2``). Keywords
        present in only one frame are dropped by the join.
    """
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved