# jobsonar/analysis/network.py
# Author: MiniMing
# feat: initial JobSonar project setup (76a8b94)
"""๊ธฐ์ˆ  ์Šคํƒ ๊ณต๋™ ์ถœํ˜„ ๋„คํŠธ์›Œํฌ ๋ถ„์„ (NetworkX)."""
import sqlite3
from itertools import combinations
from collections import Counter
import networkx as nx
import pandas as pd
def build_cooccurrence_graph(
    conn: sqlite3.Connection,
    category: str | None = None,
    min_cooccur: int = 3,
) -> nx.Graph:
    """Build a graph of skills that appear together in the same job posting.

    Args:
        conn: Open SQLite connection to the jobs database.
        category: Restrict to one job category; ``None`` means all categories.
        min_cooccur: Minimum number of shared postings for an edge to be kept.

    Returns:
        G: nodes = skills (with a ``frequency`` attribute = number of
        postings the skill appears in), edges = (skillA, skillB,
        weight=co-occurrence count).
    """
    clause = "WHERE j.is_active = 1"
    bind: list = []
    if category:
        clause += " AND j.job_category = ?"
        bind.append(category)

    df = pd.read_sql_query(
        f"""
        SELECT js.job_id, js.skill_name
        FROM job_skills js
        JOIN jobs j ON js.job_id = j.id
        {clause}
        """,
        conn,
        params=bind,
    )

    # One skill set per posting.
    posting_skills: dict[int, set[str]] = (
        df.groupby("job_id")["skill_name"].apply(set).to_dict()
    )

    # Single pass: per-skill posting counts and per-pair co-occurrence counts.
    # Pairs are sorted so (A, B) and (B, A) collapse into one key.
    skill_counts: Counter = Counter()
    pair_counts: Counter = Counter()
    for skill_set in posting_skills.values():
        skill_counts.update(skill_set)
        pair_counts.update(combinations(sorted(skill_set), 2))

    graph = nx.Graph()
    for name, count in skill_counts.items():
        graph.add_node(name, frequency=count)
    graph.add_edges_from(
        (a, b, {"weight": w})
        for (a, b), w in pair_counts.items()
        if w >= min_cooccur
    )
    return graph
def get_top_central_skills(G: nx.Graph, top_n: int = 15) -> pd.DataFrame:
    """
    Extract the most central skills from a co-occurrence graph.

    Args:
        G: Graph from :func:`build_cooccurrence_graph` (edge ``weight`` =
           co-occurrence count, node ``frequency`` = posting count).
        top_n: Number of rows to return.

    Returns:
        DataFrame with columns
        [skill, degree_centrality, betweenness, frequency],
        sorted by degree centrality (descending).
    """
    columns = ["skill", "degree_centrality", "betweenness", "frequency"]
    if len(G.nodes) == 0:
        return pd.DataFrame(columns=columns)

    degree_c = nx.degree_centrality(G)

    # BUG FIX: betweenness_centrality interprets `weight` as a *distance*
    # (shortest-path length), but our weights are co-occurrence *strengths*:
    # strongly linked skills should be treated as close, not far. Invert the
    # weights into a distance attribute on a copy so the caller's G is not
    # mutated.
    H = G.copy()
    for _, _, data in H.edges(data=True):
        data["distance"] = 1.0 / data.get("weight", 1)
    between_c = nx.betweenness_centrality(H, weight="distance")

    records = [
        {
            "skill": node,
            "degree_centrality": round(degree_c[node], 4),
            "betweenness": round(between_c[node], 4),
            "frequency": G.nodes[node].get("frequency", 0),
        }
        for node in G.nodes
    ]
    return (
        pd.DataFrame(records)
        .sort_values("degree_centrality", ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )
def graph_to_plotly_traces(G: nx.Graph) -> tuple[list, list]:
    """
    Convert the graph into Plotly scatter traces (for the dashboard).

    Args:
        G: Graph from :func:`build_cooccurrence_graph`.

    Returns:
        (edge_traces, node_traces) — lists of Plotly go.Scatter objects.
    """
    import plotly.graph_objects as go

    # Fixed seed so the layout is stable across dashboard reloads.
    pos = nx.spring_layout(G, seed=42, k=0.8)

    # Edges: each segment is (x0, x1, None) so one trace draws all lines.
    edge_x, edge_y = [], []
    for u, v in G.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        mode="lines",
        line=dict(width=0.5, color="#aaa"),
        hoverinfo="none",
        name="connections",
    )

    # Nodes: marker size is a dampened (power-0.6) function of frequency so
    # very common skills don't dominate the canvas.
    node_x = [pos[n][0] for n in G.nodes]
    node_y = [pos[n][1] for n in G.nodes]
    node_text = list(G.nodes)
    node_freq = [G.nodes[n].get("frequency", 0) for n in G.nodes]
    node_size = [max(8, G.nodes[n].get("frequency", 1) ** 0.6) for n in G.nodes]
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode="markers+text",
        text=node_text,
        textposition="top center",
        # BUG FIX: the hover previously showed %{marker.size}, which is the
        # scaled display size, not the actual posting count. Carry the real
        # frequency in customdata and reference that instead.
        customdata=node_freq,
        marker=dict(
            size=node_size,
            color=[G.degree(n) for n in G.nodes],
            colorscale="Viridis",
            showscale=True,
            colorbar=dict(title="์—ฐ๊ฒฐ ์ˆ˜"),
        ),
        hovertemplate="<b>%{text}</b><br>๊ณต๊ณ  ์ˆ˜: %{customdata}<extra></extra>",
        name="skills",
    )
    return [edge_trace], [node_trace]