math-under-llm / db /reader.py
Alex W.
feat(db+ui): add modality dimension & migrate legacy data
0ff8a89
# db/reader.py
"""
ๆ•ฐๆฎๅบ“ๆŸฅ่ฏขๆจกๅ—
- ๆŽ’่กŒๆฆœๆŸฅ่ฏข
- ๆจกๅž‹่ฏฆๆƒ…ๆŸฅ่ฏข๏ผˆๆ–นๆกˆA๏ผšๆŒ‰modality่šๅˆ + ๆ–นๆกˆB๏ผšๅŽŸๅง‹components่กŒ๏ผ‰
- ้€ๅคดๅŽŸๅง‹ๆ•ฐๆฎๆŸฅ่ฏข
- ๆ–ญ็‚น็ปญไผ ็Šถๆ€ๆŸฅ่ฏข
"""
import sqlite3
import pandas as pd
from db.schema import get_connection, init_db
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆŽ’่กŒๆฆœ
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_leaderboard(
conn: sqlite3.Connection,
modality: str = "language", # language/vision/audio/all
layer_type: str = "standard",
limit: int = 100,
) -> pd.DataFrame:
"""
ๆŽ’่กŒๆฆœๆŸฅ่ฏข๏ผŒๆŒ‰ wang_score ้™ๅบใ€‚
modality ่ฟ‡ๆปค้€š่ฟ‡ components ่กจ็š„ prefix ๅ…ณ่”ๅฎž็Žฐใ€‚
"""
sql = """
SELECT
s.model_id,
s.prefix,
s.layer_type,
s.wang_score,
s.median_pearson_QK,
s.median_ssr_QK,
s.mean_ssr_QK,
s.median_cosU_QK,
s.median_cosU_QV,
s.median_cosV_QK,
s.median_cond_Q,
s.n_layers,
s.n_records,
s.updated_at,
c.modality,
c.head_dim_min,
c.head_dim_max,
c.has_kv_shared,
c.has_global,
c.d_model
FROM model_summary s
LEFT JOIN components c
ON s.model_id = c.model_id AND s.prefix = c.prefix
WHERE s.layer_type = ?
"""
params = [layer_type]
if modality != "all":
sql += " AND c.modality = ?"
params.append(modality)
sql += " ORDER BY s.wang_score DESC LIMIT ?"
params.append(limit)
cur = conn.cursor()
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆจกๅž‹ๅˆ—่กจ๏ผˆๆ–นๆกˆA๏ผšๆŒ‰modality่šๅˆ๏ผ‰
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_analyzed_models(conn: sqlite3.Connection) -> pd.DataFrame:
"""
ๆจกๅž‹ๅˆ—่กจ๏ผŒๆŒ‰ modality ่šๅˆๅฑ‚ๆ•ฐใ€‚
language_layers = SUM(n_layers) WHERE modality='language'
่‡ชๅŠจๅŒ…ๅซ standard + global ๅฑ‚๏ผˆๅŒไธ€ prefix ไธ‹๏ผ‰ใ€‚
"""
cur = conn.cursor()
cur.execute(
"""
SELECT
m.model_id,
m.model_type,
m.analyzed_at,
m.analyze_sec,
COUNT(DISTINCT c.prefix) AS n_components,
SUM(CASE WHEN c.modality = 'language'
THEN c.n_layers ELSE 0 END) AS language_layers,
SUM(CASE WHEN c.modality = 'vision'
THEN c.n_layers ELSE 0 END) AS vision_layers,
SUM(CASE WHEN c.modality = 'audio'
THEN c.n_layers ELSE 0 END) AS audio_layers
FROM models m
LEFT JOIN components c ON m.model_id = c.model_id
GROUP BY m.model_id
ORDER BY m.analyzed_at DESC
"""
)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆจกๅž‹่ฏฆๆƒ…๏ผˆๆ–นๆกˆB๏ผšๅŽŸๅง‹components่กŒ๏ผ‰
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_model_components(
conn: sqlite3.Connection,
model_id: str,
) -> pd.DataFrame:
"""
่ฟ”ๅ›žๆŸๆจกๅž‹็š„ๅŽŸๅง‹ components ่กŒ๏ผˆๆ–นๆกˆB่ฏฆๆƒ…ๅฑ•ๅผ€็”จ๏ผ‰ใ€‚
ๆฏ่กŒ = ไธ€ไธช prefix๏ผŒๅซ modality/n_layers/head_dim ็ญ‰ใ€‚
"""
cur = conn.cursor()
cur.execute(
"""SELECT
prefix, modality, n_layers,
head_dim_min, head_dim_max,
has_kv_shared, has_global, d_model
FROM components
WHERE model_id = ?
ORDER BY modality, prefix""",
(model_id,)
)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
def get_model_summary(
conn: sqlite3.Connection,
model_id: str,
) -> pd.DataFrame:
"""่Žทๅ–ๆŸๆจกๅž‹ๆ‰€ๆœ‰็ป„ไปถ็š„ๆฑ‡ๆ€ป็ปŸ่ฎก"""
cur = conn.cursor()
cur.execute(
"""SELECT * FROM model_summary
WHERE model_id = ?
ORDER BY prefix, layer_type""",
(model_id,)
)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ้€ๅคดๅŽŸๅง‹ๆ•ฐๆฎ
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_layer_metrics(
conn: sqlite3.Connection,
model_id: str,
prefix: str = None,
modality: str = None, # language/vision/audio
layer_type: str = None, # standard/global
start_layer: int = None,
end_layer: int = None,
) -> pd.DataFrame:
"""
้€ๅคดๅŽŸๅง‹ๆ•ฐๆฎๆŸฅ่ฏขใ€‚
modality ๅ’Œ layer_type ๆ˜ฏไธคไธช็‹ฌ็ซ‹็ปดๅบฆ๏ผŒๅฏไปฅ็ป„ๅˆ่ฟ‡ๆปคใ€‚
"""
sql = "SELECT * FROM layer_head_metrics WHERE model_id = ?"
params = [model_id]
if prefix:
sql += " AND prefix = ?"
params.append(prefix)
if modality:
sql += " AND modality = ?"
params.append(modality)
if layer_type:
sql += " AND layer_type = ?"
params.append(layer_type)
if start_layer is not None:
sql += " AND layer >= ?"
params.append(start_layer)
if end_layer is not None:
sql += " AND layer <= ?"
params.append(end_layer)
sql += " ORDER BY prefix, layer, kv_head, q_head"
cur = conn.cursor()
cur.execute(sql, params)
rows = cur.fetchall()
if not rows:
return pd.DataFrame()
cols = [d[0] for d in cur.description]
return pd.DataFrame([dict(zip(cols, row)) for row in rows])
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# ๆ–ญ็‚น็ปญไผ ็Šถๆ€
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
def get_resume_status(
conn: sqlite3.Connection,
model_id: str,
prefix: str,
) -> dict:
cur = conn.cursor()
cur.execute(
"""SELECT DISTINCT layer, COUNT(*) as n_heads
FROM layer_head_metrics
WHERE model_id = ? AND prefix = ?
GROUP BY layer ORDER BY layer""",
(model_id, prefix)
)
rows = cur.fetchall()
done_layers = {r[0]: r[1] for r in rows}
return {
"done_layers": set(done_layers.keys()),
"layer_detail": done_layers,
"total_done": len(done_layers),
}