import sqlite3
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import community
import matplotlib.pyplot as plt
import random
import time
from datetime import datetime
from googleapiclient.discovery import build
import langid
from tld import get_tld
from fuzzywuzzy import fuzz
# ---------------------------
# Utility Functions
# ---------------------------
def language_detection(text):
    """Return the language code of the most likely language of `text`."""
    return langid.classify(text)[0]

def extract_mainDomain(url):
    """Extract the registered (first-level) domain from a URL, or "" on failure."""
    try:
        res = get_tld(url, as_object=True)
        return res.fld
    except Exception:
        return ""

def fuzzy_ratio(str1, str2):
    """Order-sensitive Levenshtein similarity (0-100)."""
    return fuzz.ratio(str1, str2)

def fuzzy_token_set_ratio(str1, str2):
    """Token-set similarity (0-100), insensitive to word order and duplicates."""
    return fuzz.token_set_ratio(str1, str2)
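
# Illustration (a sketch, not part of the pipeline): the two fuzzy scores
# differ on word order, which is why both are recorded per result below.
def _demo_fuzzy_scores():
    a, b = "python keyword clustering", "clustering keyword python"
    print(fuzzy_ratio(a, b))            # order-sensitive, scores low here
    print(fuzzy_token_set_ratio(a, b))  # ignores word order, scores 100 here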
# ---------------------------
# Google Custom Search
# ---------------------------
def google_search(query, api_key, cse_id, hl, gl):
    """Run a single Custom Search query; return the raw response, or None on error."""
    try:
        service = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        res = service.cse().list(
            q=query, hl=hl, gl=gl, cx=cse_id,
            fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
            num=10
        ).execute()
        time.sleep(1)  # throttle requests to stay within the API quota
        return res
    except Exception as e:
        print("Search error:", e)
        return None
# ---------------------------
# Fetch and Store Search Results
# ---------------------------
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Query the API for every keyword and append the results to a SQLite table."""
    timestamp = datetime.now()
    rows = []
    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        if result and "items" in result:
            for i, item in enumerate(result["items"]):
                snippet = item.get("snippet", "")
                title = item.get("title", "")
                rows.append({
                    "requestTimestamp": timestamp,
                    "searchTerms": query,
                    "gl": gl,
                    "hl": hl,
                    "totalResults": result["queries"]["request"][0]["totalResults"],
                    "link": item["link"],
                    "displayLink": item["displayLink"],
                    "main_domain": extract_mainDomain(item["link"]),
                    "position": i + 1,
                    "snippet": snippet,
                    "snippet_language": language_detection(snippet),
                    "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                    "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                    "title": title,
                    "title_matchScore_order": fuzzy_ratio(title, query),
                    "title_matchScore_token": fuzzy_token_set_ratio(title, query),
                })
    df = pd.DataFrame(rows)
    with sqlite3.connect(database) as conn:
        # to_sql requires the connection as its second argument
        df.to_sql(table, conn, index=False, if_exists="append", dtype={"requestTimestamp": "DateTime"})
# ---------------------------
# Cluster Graphs
# ---------------------------
def com_position(n, scale=1, center=(0, 0)):
    """Place n community centers evenly on a circle."""
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    pos = np.column_stack((np.cos(theta), np.sin(theta)))
    return scale * pos + np.array(center)

def node_position(nodes, scale=1, center=(0, 0)):
    """Place a community's nodes on a circle around its center."""
    n = len(nodes)
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    pos = np.column_stack((np.cos(theta), np.sin(theta)))
    return dict(zip(nodes, scale * pos + np.array(center)))

def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster keywords that share SERP URLs and plot the resulting graph."""
    with sqlite3.connect(database) as conn:
        if timestamp == "max":
            query = f'''
                SELECT * FROM {serp_table}
                WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
            '''
            df = pd.read_sql(query, conn)
        else:
            # Bind the timestamp as a parameter rather than quoting it into the SQL.
            query = f'SELECT * FROM {serp_table} WHERE requestTimestamp = ?'
            df = pd.read_sql(query, conn, params=(timestamp,))
    # Two keywords are connected if they rank for the same URL.
    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])
    for _, group in df.groupby("link"):
        terms = group["searchTerms"].unique()
        for a in terms:
            for b in terms:
                if a != b:
                    G.add_edge(a, b)
    communities = community.greedy_modularity_communities(G)
    degrees = dict(G.degree())
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]
    # Lay each community out on its own small circle around a ring of centers.
    pos = {}
    centers = com_position(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_position(list(group), scale=0.8, center=centers[i]))
    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8, edge_color='gray', alpha=0.2)
    for i, group in enumerate(communities):
        nx.draw_networkx_nodes(
            G, pos, nodelist=list(group), node_color=colors[i],
            node_size=[degrees[n] * 10 for n in group], ax=ax
        )
    ax.axis('off')
    # Return cluster assignments, stamped with the actual snapshot time
    # (not the literal "max" sentinel).
    actual_timestamp = df["requestTimestamp"].max() if timestamp == "max" else timestamp
    cluster_rows = []
    for i, group in enumerate(communities):
        for kw in group:
            cluster_rows.append({
                "searchTerms": kw,
                "cluster": i,
                "requestTimestamp": actual_timestamp
            })
    df_clusters = pd.DataFrame(cluster_rows)
    return fig, df_clusters
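
# Example usage (a sketch, reusing the hypothetical database/table names from
# the fetch example): cluster the latest snapshot, save the plot, and list
# the keywords per cluster.
def _demo_cluster_latest():
    fig, df_clusters = getClustersWithGraph("serps.db", "serp_results")
    fig.savefig("clusters.png", bbox_inches="tight")
    print(df_clusters.groupby("cluster")["searchTerms"].apply(list))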
# ---------------------------
# Compare Clusters
# ---------------------------
def compare_clusters(df1, df2):
    """Return the keywords whose cluster assignment changed between two snapshots."""
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved
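
# Example usage (a sketch): compare an older snapshot against the latest one
# and show which keywords changed cluster. The timestamp string is a
# hypothetical value; pass one that actually exists in your table.
def _demo_compare_runs():
    _, df_old = getClustersWithGraph("serps.db", "serp_results",
                                     timestamp="2024-01-01 00:00:00")
    _, df_new = getClustersWithGraph("serps.db", "serp_results")  # latest
    moved = compare_clusters(df_old, df_new)
    print(moved[["searchTerms", "cluster_1", "cluster_2"]])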