import sqlite3
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import community
import matplotlib.pyplot as plt
import random
import time
from datetime import datetime
from googleapiclient.discovery import build
import langid
from tld import get_tld
from fuzzywuzzy import fuzz
# ---------------------------
# Utility Functions
# ---------------------------
def language_detection(text):
    """Return the language code of the most likely language of `text`."""
    return langid.classify(text)[0]

def extract_mainDomain(url):
    """Extract the registered (first-level) domain from a URL, or "" on failure."""
    try:
        res = get_tld(url, as_object=True)
        return res.fld
    except Exception:
        return ""

def fuzzy_ratio(str1, str2):
    """Order-sensitive Levenshtein similarity (0-100)."""
    return fuzz.ratio(str1, str2)

def fuzzy_token_set_ratio(str1, str2):
    """Token-set similarity (0-100), insensitive to word order and duplicates."""
    return fuzz.token_set_ratio(str1, str2)
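
# Illustration (a sketch, not part of the pipeline): the two fuzzy scores
# differ on word order, which is why both are recorded per result below.
def _demo_fuzzy_scores():
    a, b = "python keyword clustering", "clustering keyword python"
    print(fuzzy_ratio(a, b))            # order-sensitive, scores low here
    print(fuzzy_token_set_ratio(a, b))  # ignores word order, scores 100 here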
# ---------------------------
# Google Custom Search
# ---------------------------
def google_search(query, api_key, cse_id, hl, gl):
    """Run a single Custom Search query; return the raw response, or None on error."""
    try:
        service = build("customsearch", "v1", developerKey=api_key, cache_discovery=False)
        res = service.cse().list(
            q=query, hl=hl, gl=gl, cx=cse_id,
            fields='queries(request(totalResults,searchTerms,hl,gl)),items(title,displayLink,link,snippet)',
            num=10
        ).execute()
        time.sleep(1)  # throttle requests to stay within the API quota
        return res
    except Exception as e:
        print("Search error:", e)
        return None
# ---------------------------
# Fetch and Store Search Results
# ---------------------------
def getSearchResult(keywords, hl, gl, api_key, cse_id, database, table):
    """Query the API for every keyword and append the results to a SQLite table."""
    timestamp = datetime.now()
    rows = []
    for query in keywords:
        result = google_search(query, api_key, cse_id, hl, gl)
        if result and "items" in result:
            for i, item in enumerate(result["items"]):
                snippet = item.get("snippet", "")
                title = item.get("title", "")
                rows.append({
                    "requestTimestamp": timestamp,
                    "searchTerms": query,
                    "gl": gl,
                    "hl": hl,
                    "totalResults": result["queries"]["request"][0]["totalResults"],
                    "link": item["link"],
                    "displayLink": item["displayLink"],
                    "main_domain": extract_mainDomain(item["link"]),
                    "position": i + 1,
                    "snippet": snippet,
                    "snippet_language": language_detection(snippet),
                    "snippet_matchScore_order": fuzzy_ratio(snippet, query),
                    "snippet_matchScore_token": fuzzy_token_set_ratio(snippet, query),
                    "title": title,
                    "title_matchScore_order": fuzzy_ratio(title, query),
                    "title_matchScore_token": fuzzy_token_set_ratio(title, query),
                })
    df = pd.DataFrame(rows)
    with sqlite3.connect(database) as conn:
        # to_sql requires the connection as its second argument
        df.to_sql(table, conn, index=False, if_exists="append", dtype={"requestTimestamp": "DateTime"})
# ---------------------------
# Cluster Graphs
# ---------------------------
def com_position(n, scale=1, center=(0, 0)):
    """Place n community centers evenly on a circle."""
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    pos = np.column_stack((np.cos(theta), np.sin(theta)))
    return scale * pos + np.array(center)

def node_position(nodes, scale=1, center=(0, 0)):
    """Place a community's nodes on a circle around its center."""
    n = len(nodes)
    theta = np.linspace(0, 2 * np.pi, n, endpoint=False)
    pos = np.column_stack((np.cos(theta), np.sin(theta)))
    return dict(zip(nodes, scale * pos + np.array(center)))

def getClustersWithGraph(database, serp_table, timestamp="max"):
    """Cluster keywords that share SERP URLs and plot the resulting graph."""
    with sqlite3.connect(database) as conn:
        if timestamp == "max":
            query = f'''
                SELECT * FROM {serp_table}
                WHERE requestTimestamp = (SELECT MAX(requestTimestamp) FROM {serp_table})
            '''
            df = pd.read_sql(query, conn)
        else:
            # Bind the timestamp as a parameter rather than quoting it into the SQL.
            query = f'SELECT * FROM {serp_table} WHERE requestTimestamp = ?'
            df = pd.read_sql(query, conn, params=(timestamp,))
    # Two keywords are connected if they rank for the same URL.
    G = nx.Graph()
    G.add_nodes_from(df["searchTerms"])
    for _, group in df.groupby("link"):
        terms = group["searchTerms"].unique()
        for a in terms:
            for b in terms:
                if a != b:
                    G.add_edge(a, b)
    communities = community.greedy_modularity_communities(G)
    degrees = dict(G.degree())
    colors = ["#" + ''.join(random.choices('0123456789ABCDEF', k=6)) for _ in communities]
    # Lay each community out on its own small circle around a ring of centers.
    pos = {}
    centers = com_position(len(communities), scale=3)
    for i, group in enumerate(communities):
        pos.update(node_position(list(group), scale=0.8, center=centers[i]))
    fig, ax = plt.subplots(figsize=(12, 8), dpi=100)
    nx.draw(G, pos, with_labels=True, ax=ax, node_size=10, font_size=8, edge_color='gray', alpha=0.2)
    for i, group in enumerate(communities):
        nx.draw_networkx_nodes(
            G, pos, nodelist=list(group), node_color=colors[i],
            node_size=[degrees[n] * 10 for n in group], ax=ax
        )
    ax.axis('off')
    # Return cluster assignments, stamped with the actual snapshot time
    # (not the literal "max" sentinel).
    actual_timestamp = df["requestTimestamp"].max() if timestamp == "max" else timestamp
    cluster_rows = []
    for i, group in enumerate(communities):
        for kw in group:
            cluster_rows.append({
                "searchTerms": kw,
                "cluster": i,
                "requestTimestamp": actual_timestamp
            })
    df_clusters = pd.DataFrame(cluster_rows)
    return fig, df_clusters
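
# Example usage (a sketch, reusing the hypothetical database/table names from
# the fetch example): cluster the latest snapshot, save the plot, and list
# the keywords per cluster.
def _demo_cluster_latest():
    fig, df_clusters = getClustersWithGraph("serps.db", "serp_results")
    fig.savefig("clusters.png", bbox_inches="tight")
    print(df_clusters.groupby("cluster")["searchTerms"].apply(list))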
# ---------------------------
# Compare Clusters
# ---------------------------
def compare_clusters(df1, df2):
    """Return the keywords whose cluster assignment changed between two snapshots."""
    merged = pd.merge(df1, df2, on="searchTerms", suffixes=("_1", "_2"))
    moved = merged[merged["cluster_1"] != merged["cluster_2"]]
    return moved
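
# Example usage (a sketch): compare an older snapshot against the latest one
# and show which keywords changed cluster. The timestamp string is a
# hypothetical value; pass one that actually exists in your table.
def _demo_compare_runs():
    _, df_old = getClustersWithGraph("serps.db", "serp_results",
                                     timestamp="2024-01-01 00:00:00")
    _, df_new = getClustersWithGraph("serps.db", "serp_results")  # latest
    moved = compare_clusters(df_old, df_new)
    print(moved[["searchTerms", "cluster_1", "cluster_2"]])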