Spaces:
Runtime error
Runtime error
Upload 23 files
Browse files — app.py +107 -296
- conversation_storyline/__init__.py +12 -0
- conversation_storyline/__pycache__/__init__.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/config.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/embeddings.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/io.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/pipeline.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/reply_to.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/schemas.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/topic_shifts.cpython-313.pyc +0 -0
- conversation_storyline/config.py +33 -0
- conversation_storyline/embeddings.py +53 -0
- conversation_storyline/io.py +49 -0
- conversation_storyline/layout_heuristic.py +29 -0
- conversation_storyline/layout_ilp.py +170 -0
- conversation_storyline/openai_refiner.py +82 -0
- conversation_storyline/pipeline.py +187 -0
- conversation_storyline/plots.py +83 -0
- conversation_storyline/render.py +45 -0
- conversation_storyline/reply_to.py +154 -0
- conversation_storyline/schemas.py +31 -0
- conversation_storyline/topic_shifts.py +70 -0
- requirements.txt +13 -11
app.py
CHANGED
|
@@ -1,296 +1,107 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
summary = f"Mensajes: {len(interactions)} | Participantes: {len(speakers)} | Modelo: {model_name}"
|
| 109 |
-
return interactions, metrics_df, layout_df, topic_shifts, summary
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
# ----------------------------
|
| 113 |
-
# Plot builders
|
| 114 |
-
# ----------------------------
|
| 115 |
-
def render_storyline_png(layout_df: pd.DataFrame, topic_shifts: List[int], out_path: str) -> str:
    """
    Minimal storyline render:
    - x axis: t_start
    - y axis: lane position

    Saves a PNG to *out_path* and returns that path. If *layout_df* is
    None or empty, a placeholder image is written instead.
    """
    if layout_df is None or layout_df.empty:
        # Create an empty placeholder image
        fig, ax = plt.subplots(figsize=(10, 3))
        ax.text(0.5, 0.5, "No layout data", ha="center", va="center")
        ax.axis("off")
        fig.savefig(out_path, dpi=160, bbox_inches="tight")
        plt.close(fig)
        return out_path

    df = layout_df.copy()
    if "y_smooth" not in df.columns:
        # Fall back to the raw lane when no smoothed column was provided.
        df["y_smooth"] = df["y"]

    fig, ax = plt.subplots(figsize=(14, 6))
    for sp, g in df.groupby("speaker", sort=False):
        g = g.sort_values("t_start")
        ax.plot(g["t_start"], g["y_smooth"], linewidth=1.5)
        # label at start of each speaker's line
        ax.text(g["t_start"].iloc[0], g["y_smooth"].iloc[0], str(sp), fontsize=9)

    # topic shifts as vertical lines (approx: by message id; if your x is t_start, adapt mapping)
    for x in topic_shifts:
        ax.axvline(x=x, linewidth=1.0, linestyle="--", alpha=0.6)

    ax.set_title("Storyline")
    ax.set_xlabel("t")
    ax.set_yticks([])
    fig.tight_layout()
    fig.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.close(fig)
    return out_path
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def plot_sentiment(interactions: List[dict]):
    """Plotly scatter of sentiment_score vs. message id, colored by speaker.

    Returns an empty Figure when there are no interactions.
    """
    df = pd.DataFrame(interactions)
    if df.empty:
        return go.Figure()
    return px.scatter(df, x="id", y="sentiment_score", color="speaker", title="Sentiment timeline")
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
def plot_reply_distance_hist(interactions: List[dict]):
    """Histogram of reply distances (id - reply_to_id); empty Figure when no data."""
    df = pd.DataFrame(interactions)
    if df.empty:
        return go.Figure()
    # Only messages that actually reply to something contribute a distance.
    d = df.dropna(subset=["reply_to_id"]).assign(dist=lambda x: x["id"] - x["reply_to_id"])
    return px.histogram(d, x="dist", nbins=50, title="Reply distance histogram")
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
def plot_bump(metrics_df: pd.DataFrame, top_n: int = 20):
    """Bump chart of per-segment centrality rank for the top_n speakers.

    Rank 1 (highest centrality) is drawn at the top via the reversed y axis.
    """
    if metrics_df is None or metrics_df.empty:
        return go.Figure()
    # Keep only the top_n speakers by mean centrality across all segments.
    avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
    df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
    df["rank"] = df.groupby("t_start")["centrality"].rank(ascending=False, method="dense")
    fig = px.line(df, x="t_start", y="rank", color="speaker", title=f"Bump chart (Top {top_n})")
    fig.update_yaxes(autorange="reversed")
    return fig
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
def plot_heatmap(metrics_df: pd.DataFrame, top_n: int = 20):
    """Heatmap of mean centrality per (speaker, segment start) for the top_n speakers."""
    if metrics_df is None or metrics_df.empty:
        return go.Figure()
    avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
    df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
    # Rows: speakers, columns: segment starts; missing cells become 0.
    piv = df.pivot_table(index="speaker", columns="t_start", values="centrality", aggfunc="mean").fillna(0)
    return px.imshow(piv, aspect="auto", title=f"Centrality heatmap (Top {top_n})")
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
def plot_topic_sankey(interactions: List[dict]):
    """Sankey diagram of consecutive topic transitions (self-transitions excluded)."""
    df = pd.DataFrame(interactions)
    if df.empty or "topic_label" not in df.columns:
        return go.Figure()

    topics = df["topic_label"].astype(str).tolist()
    # Count adjacent (from, to) topic pairs, skipping repeats of the same topic.
    links: Dict[Tuple[str, str], int] = {}
    for a, b in zip(topics[:-1], topics[1:]):
        if a == b:
            continue
        links[(a, b)] = links.get((a, b), 0) + 1

    if not links:
        return go.Figure()

    # Stable node ordering so indices are deterministic across runs.
    nodes = sorted(set([t for ab in links.keys() for t in ab]))
    idx = {n: i for i, n in enumerate(nodes)}

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(label=nodes),
                link=dict(
                    source=[idx[a] for (a, b) in links.keys()],
                    target=[idx[b] for (a, b) in links.keys()],
                    value=list(links.values()),
                ),
            )
        ]
    )
    fig.update_layout(title="Topic transitions (Sankey)")
    return fig
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
# ----------------------------
|
| 224 |
-
# Gradio callback (NO global state)
|
| 225 |
-
# ----------------------------
|
| 226 |
-
def process_transcript(transcript_text: str, model_name: str):
    """Gradio callback: run the analysis pipeline and build every output artifact.

    Returns (png path, summary text, 5 plotly figures) in the exact order wired
    into ``btn.click(outputs=...)``. Holds no global state.
    """
    # Run your pipeline (replace run_analysis internals with your real pipeline)
    interactions, metrics_df, layout_df, topic_shifts, summary = run_analysis(transcript_text, model_name)

    # This is where the previous version crashed:
    # `all_interactions` was not defined. Here we use `interactions`.
    active_participants = sorted({n["speaker"] for n in interactions}) if interactions else []

    # Build the storyline image in a fresh temp directory per request.
    tmpdir = tempfile.mkdtemp(prefix="storyline_")
    img_path = os.path.join(tmpdir, "storyline.png")
    render_storyline_png(layout_df, topic_shifts, img_path)

    # Secondary plots
    sentiment_fig = plot_sentiment(interactions)
    bump_fig = plot_bump(metrics_df, top_n=25)
    heatmap_fig = plot_heatmap(metrics_df, top_n=25)
    hist_fig = plot_reply_distance_hist(interactions)
    sankey_fig = plot_topic_sankey(interactions)

    # Summary text (participant list truncated to the first 50 names)
    summary_full = summary + f"\nParticipantes activos: {', '.join(active_participants[:50])}" + (
        " ..." if len(active_participants) > 50 else ""
    )

    return img_path, summary_full, sentiment_fig, bump_fig, heatmap_fig, hist_fig, sankey_fig
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
# ------------------------------------------------------------------
|
| 255 |
-
# Theme + UI (ONLY ONCE)
|
| 256 |
-
# ------------------------------------------------------------------
|
| 257 |
-
# ------------------------------------------------------------------
# Theme + UI (built ONCE at module import)
# ------------------------------------------------------------------
theme = gr.themes.Soft(primary_hue="blue").set(
    body_background_fill="*neutral_50",
    block_background_fill="*neutral_100",
)

with gr.Blocks(
    title="Conversation Storyline Visualizer – Advanced",
    theme=theme,  # theme belongs here (more compatible than demo.launch(theme=...))
) as demo:
    gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones")
    gr.Markdown("Soporte para conversaciones largas con chunking + refinamiento reply_to y topic shifts ensemble.")

    with gr.Row():
        model_selector = gr.Dropdown(
            choices=["gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"],
            value="gpt-4o-2024-08-06",
            label="Modelo OpenAI",
        )

    input_text = gr.Textbox(label="Transcripción", lines=20)
    btn = gr.Button("Generar Visualizaciones", variant="primary")

    with gr.Tabs():
        with gr.Tab("Storyline Principal"):
            main_img = gr.Image(label="Storyline (PNG)")
            summary_box = gr.Textbox(label="Resumen", lines=6)
        with gr.Tab("Análisis Detallado"):
            sentiment_plot = gr.Plot(label="Sentimiento")
            bump_plot = gr.Plot(label="Ranking (Bump)")
            heatmap_plot = gr.Plot(label="Heatmap centralidad")
            hist_plot = gr.Plot(label="Hist reply-distance")
            sankey_plot = gr.Plot(label="Sankey topics")

    # Output order must match the tuple returned by process_transcript.
    btn.click(
        fn=process_transcript,
        inputs=[input_text, model_selector],
        outputs=[main_img, summary_box, sentiment_plot, bump_plot, heatmap_plot, hist_plot, sankey_plot],
    )

demo.launch()
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
from conversation_storyline.pipeline import run_pipeline_from_text
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Shared Gradio theme for the whole app.
THEME = gr.themes.Soft(primary_hue="blue").set(
    body_background_fill="*neutral_50",
    block_background_fill="*neutral_100",
)

# Browser-tab / Blocks title.
TITLE = "Conversation Storyline Visualizer – v4"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def process_transcript(transcript: str, model_selector: str):
    """
    Input: pasted transcript text.
    Output: storyline.png + summary + plotly figures + downloadable artifacts,
    returned in the exact order expected by the Gradio ``outputs=`` wiring.
    """
    if not transcript or not transcript.strip():
        raise gr.Error("Pega una transcripción en el cuadro de texto.")

    # Fresh working directory per request: no global state shared between runs.
    outdir = Path(tempfile.mkdtemp(prefix="storyline_v4_"))

    outputs = run_pipeline_from_text(
        transcript_text=transcript,
        out_dir=outdir,
        openai_model=model_selector,
    )

    # Order must match the `outputs=[...]` list in btn.click below.
    return (
        str(outputs["storyline_png"]),
        outputs["summary_text"],
        outputs["fig_sentiment"],
        outputs["fig_bump"],
        outputs["fig_heatmap"],
        outputs["fig_hist_reply_dist"],
        outputs["fig_sankey"],
        outputs["storyline_html"],
        outputs["metrics_csv"],
        outputs["interactions_jsonl"],
        outputs["graph_json"],
    )
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
with gr.Blocks(title=TITLE, theme=THEME) as demo:
    gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones (v4)")
    gr.Markdown(
        "- Pega una transcripción tipo `Speaker A: ...`\n"
        "- Soporta conversaciones largas (chunking)\n"
        "- Reply-to Top-K embeddings + topic shifts ruptures + layout OR-Tools\n"
    )

    with gr.Row():
        # "none (offline)" runs the pipeline without any OpenAI call.
        model_selector = gr.Dropdown(
            choices=[
                "gpt-4o-2024-08-06",
                "gpt-4o-mini-2024-07-18",
                "none (offline)",
            ],
            value="none (offline)",
            label="Modelo (opcional; si hay OPENAI_API_KEY)",
        )

    input_text = gr.Textbox(label="Transcripción", lines=20, placeholder="Pega aquí la transcripción...")
    btn = gr.Button("Generar Visualizaciones", variant="primary")

    with gr.Tabs():
        with gr.Tab("Storyline Principal"):
            main_img = gr.Image(label="Storyline (PNG)")
            summary_box = gr.Textbox(label="Resumen", lines=10)
            storyline_html = gr.HTML(label="Storyline (HTML embebido)")
        with gr.Tab("Análisis Detallado"):
            sentiment_plot = gr.Plot(label="Sentiment (si aplica)")
            bump_plot = gr.Plot(label="Bump actividad por segmento")
            heatmap_plot = gr.Plot(label="Heatmap interacciones")
            hist_plot = gr.Plot(label="Histograma distancia reply_to")
            sankey_plot = gr.Plot(label="Sankey Speaker → Topic")

        with gr.Tab("Descargas"):
            metrics_csv = gr.File(label="metrics.csv")
            interactions_jsonl = gr.File(label="interactions.jsonl")
            graph_json = gr.File(label="graph.json")

    # Output order must match the tuple returned by process_transcript.
    btn.click(
        fn=process_transcript,
        inputs=[input_text, model_selector],
        outputs=[
            main_img,
            summary_box,
            sentiment_plot,
            bump_plot,
            heatmap_plot,
            hist_plot,
            sankey_plot,
            storyline_html,
            metrics_csv,
            interactions_jsonl,
            graph_json,
        ],
    )

if __name__ == "__main__":
    # HF Spaces: bind all interfaces; the port comes from $PORT (default 7860).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
conversation_storyline/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public submodules of the conversation_storyline package.
__all__ = [
    "pipeline",
    "io",
    "schemas",
    "embeddings",
    "reply_to",
    "topic_shifts",
    "layout_ilp",
    "layout_heuristic",
    "render",
    "plots",
]
|
conversation_storyline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (314 Bytes). View file
|
|
|
conversation_storyline/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (1.49 kB). View file
|
|
|
conversation_storyline/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (3.08 kB). View file
|
|
|
conversation_storyline/__pycache__/io.cpython-313.pyc
ADDED
|
Binary file (2.04 kB). View file
|
|
|
conversation_storyline/__pycache__/pipeline.cpython-313.pyc
ADDED
|
Binary file (8.41 kB). View file
|
|
|
conversation_storyline/__pycache__/reply_to.cpython-313.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|
conversation_storyline/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
conversation_storyline/__pycache__/topic_shifts.cpython-313.pyc
ADDED
|
Binary file (3.76 kB). View file
|
|
|
conversation_storyline/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass(frozen=True)
class Settings:
    """Immutable tuning knobs shared across the pipeline."""
    # Parsing / ingest
    max_speaker_label_len: int = 64

    # Reply-to
    reply_window: int = 60           # how many previous messages to consider as candidates
    reply_top_k: int = 10            # top-k candidates by embedding similarity
    reply_min_sim: float = 0.25      # if top1 < threshold -> may stay None (offline)
    reply_ambig_delta: float = 0.03  # if top1-top2 < delta -> candidate for LLM refinement (if available)

    # Topic shifts (ruptures)
    topic_min_size: int = 8
    topic_penalty_scale: float = 2.4  # higher -> fewer change points

    # Layout (OR-Tools)
    ilp_time_limit_s: float = 6.0
    ilp_max_participants: int = 28  # beyond this, fall back to the heuristic layout
    ilp_max_segments: int = 120     # beyond this, fall back to the heuristic layout

    # Rendering
    storyline_dpi: int = 180


# Shared singleton used by the other modules.
settings = Settings()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def has_openai_key() -> bool:
    """Report whether a non-blank OPENAI_API_KEY is present in the environment."""
    key = os.getenv("OPENAI_API_KEY", "")
    return key.strip() != ""
|
conversation_storyline/embeddings.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class Embedder:
    """
    CPU embeddings via Sentence-Transformers.
    - Normalizes vectors so cosine similarity reduces to a dot product.
    - Falls back to a simple TF-IDF model when Sentence-Transformers is unavailable.
    """
    model_name: str = "intfloat/multilingual-e5-small"
    # Lazily initialized backends (plain class attributes, not dataclass fields).
    _st = None
    _tfidf = None
    _tfidf_vectorizer = None

    def _load_st(self):
        # Import lazily so the package imports cleanly without sentence-transformers installed.
        if self._st is None:
            from sentence_transformers import SentenceTransformer
            self._st = SentenceTransformer(self.model_name)

    def encode(self, texts: List[str]) -> np.ndarray:
        """Return one L2-normalized float32 row vector per input text (None -> "")."""
        texts = [t or "" for t in texts]
        try:
            self._load_st()
            # e5 models expect "passage: " / "query: " prefixes; "passage:" is enough here.
            inp = [("passage: " + t) for t in texts]
            X = np.array(self._st.encode(inp, normalize_embeddings=True, show_progress_bar=False), dtype=np.float32)
            return X
        except Exception:
            # TF-IDF fallback (not semantically perfect, but always works offline).
            from sklearn.feature_extraction.text import TfidfVectorizer
            if self._tfidf_vectorizer is None:
                # NOTE(review): the vocabulary is fitted on the FIRST batch only; later
                # batches are transformed against that vocabulary — confirm this is intended.
                self._tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
                X = self._tfidf_vectorizer.fit_transform(texts).astype(np.float32)
            else:
                X = self._tfidf_vectorizer.transform(texts).astype(np.float32)
            # Densify and L2-normalize rows; epsilon guards all-zero rows.
            X = X.toarray()
            norms = np.linalg.norm(X, axis=1, keepdims=True) + 1e-9
            return X / norms
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def cosine_sim_matrix(A: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity of each row of *A* against *b*.

    A: (n, d) L2-normalized rows
    b: (d,) L2-normalized vector
    returns: (n,) float32 scores
    """
    scores = np.matmul(A, b)
    return scores.astype(np.float32)
|
conversation_storyline/io.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List
|
| 3 |
+
from .schemas import Interaction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Tried in order; the first matching pattern wins.
SPEAKER_PATTERNS = [
    # "Speaker A: ..." (explicit "Speaker" prefix, label up to 64 chars)
    re.compile(r"^(?P<speaker>Speaker\s+[A-Za-z0-9_\- ]{1,64})\s*:\s*(?P<text>.+)\s*$"),
    # "A: ..." (bare name up to 32 chars; Spanish accented letters allowed)
    re.compile(r"^(?P<speaker>[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9_\- ]{1,32})\s*:\s*(?P<text>.+)\s*$"),
]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def parse_transcript(text: str) -> List[Interaction]:
    """
    Robust parser for pasted text:
    - every line matching "SPEAKER: ..." starts a new message;
    - lines without a speaker prefix are appended to the previous message
      (continuation), or open an "Unknown" message if none exists yet.
    """
    interactions: List[Interaction] = []
    current = None

    for raw_line in (text or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines carry no content.
            continue

        match = None
        for pattern in SPEAKER_PATTERNS:
            candidate = pattern.match(stripped)
            if candidate is not None:
                match = candidate
                break

        if match is not None:
            # New message: id is its position in the output list.
            current = Interaction(
                message_id=len(interactions),
                speaker=match.group("speaker").strip(),
                text=match.group("text").strip(),
            )
            interactions.append(current)
        elif current is None:
            # Content before any speaker line goes to an "Unknown" message.
            current = Interaction(message_id=0, speaker="Unknown", text=stripped)
            interactions.append(current)
        else:
            # Continuation line: append to the current message.
            current.text = (current.text + " " + stripped).strip()

    return interactions
|
conversation_storyline/layout_heuristic.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def compute_layout_heuristic(metrics: pd.DataFrame) -> pd.DataFrame:
    """
    Fast fallback layout:
    - ranks speakers by total message count (descending)
    - assigns each speaker a fixed integer lane (y), unchanged across time
    Returns *metrics* untouched when it is empty.
    """
    if metrics.empty:
        return metrics

    ordered_speakers = (
        metrics.groupby("speaker")["message_id"]
        .count()
        .sort_values(ascending=False)
        .index
        .tolist()
    )
    lane_of = {speaker: lane for lane, speaker in enumerate(ordered_speakers)}

    result = metrics.copy()
    result["y"] = result["speaker"].map(lane_of).astype(float)
    result["y_smooth"] = result["y"]  # the heuristic applies no smoothing
    result["line_width"] = 1.0
    return result
|
conversation_storyline/layout_ilp.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple, Set
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from ortools.sat.python import cp_model
|
| 8 |
+
|
| 9 |
+
from .config import settings
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def compute_storyline_layout_ilp(
    metrics: pd.DataFrame,
    segments: List[Tuple[int, int, int]],
    interactions_edges: Dict[Tuple[str, str], float],
) -> pd.DataFrame:
    """
    Per-segment layout with CP-SAT, minimizing:
    - order inversions between consecutive segments (proxy for line crossings)
    - wiggle (|y_t - y_{t-1}| per speaker)
    - lane distance between speakers that interact within the same segment

    metrics: one row per message with speaker, message_id, topic_id, etc.
    segments: (seg_id, start, end) with end exclusive on message_id
    interactions_edges: global (speaker_a, speaker_b) -> weight

    Returns *metrics* with y/y_smooth/line_width columns; all-NaN y on
    empty input, oversized problems, or an infeasible/timed-out solve.
    """
    if metrics.empty:
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    speakers = sorted(metrics["speaker"].unique().tolist())
    P = len(speakers)
    S = len(segments)

    if P > settings.ilp_max_participants or S > settings.ilp_max_segments:
        # Too large: the pipeline decides the fallback; guarded here for safety.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    sp_idx = {s: i for i, s in enumerate(speakers)}
    # NOTE(review): lane count is derived from speakers-per-topic_id, but the
    # AllDifferent constraint below is per *segment* — confirm segments never
    # contain more distinct speakers than any topic, or the model is infeasible.
    max_lanes = max(
        metrics.groupby("topic_id")["speaker"].nunique().max(),
        1
    )

    # active[p][s]: does speaker p appear in segment s?
    active = [[False] * S for _ in range(P)]
    for seg_id, start, end in segments:
        seg_speakers = set(metrics[(metrics["message_id"] >= start) & (metrics["message_id"] < end)]["speaker"])
        for sp in seg_speakers:
            active[sp_idx[sp]][seg_id] = True

    model = cp_model.CpModel()

    # y[(p, s)]: lane of speaker p in segment s; created only for active cells.
    y = {}
    for p in range(P):
        for s in range(S):
            if active[p][s]:
                y[(p, s)] = model.NewIntVar(0, max_lanes - 1, f"y_p{p}_s{s}")

    # One lane per speaker within a segment (active speakers only).
    for s in range(S):
        vars_s = [y[(p, s)] for p in range(P) if (p, s) in y]
        if len(vars_s) >= 2:
            model.AddAllDifferent(vars_s)

    # Wiggle terms: |y(p,s) - y(p,s-1)| for consecutive active segments.
    wiggle_terms = []
    for p in range(P):
        for s in range(1, S):
            if (p, s) in y and (p, s - 1) in y:
                d = model.NewIntVar(0, max_lanes, f"wiggle_p{p}_s{s}")
                model.AddAbsEquality(d, y[(p, s)] - y[(p, s - 1)])
                wiggle_terms.append(d)

    # Pair ordering vars above[p,q,s], restricted to pairs that actually
    # interact (weight > 0) to keep the model small.
    interesting_pairs: Set[Tuple[int, int]] = set()
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        if pa < pb:
            interesting_pairs.add((pa, pb))
        else:
            interesting_pairs.add((pb, pa))

    above = {}
    for (p, q) in interesting_pairs:
        for s in range(S):
            if (p, s) in y and (q, s) in y:
                b = model.NewBoolVar(f"above_p{p}_q{q}_s{s}")  # b true <=> p above q
                above[(p, q, s)] = b
                # b -> y[p] + 1 <= y[q]
                model.Add(y[(p, s)] + 1 <= y[(q, s)]).OnlyEnforceIf(b)
                # not b -> y[q] + 1 <= y[p]
                model.Add(y[(q, s)] + 1 <= y[(p, s)]).OnlyEnforceIf(b.Not())

    # Inversions: inv = |above_s - above_{s-1}| penalizes order changes.
    inv_terms = []
    for (p, q) in interesting_pairs:
        for s in range(1, S):
            k1 = (p, q, s)
            k0 = (p, q, s - 1)
            if k1 in above and k0 in above:
                inv = model.NewBoolVar(f"inv_p{p}_q{q}_s{s}")
                b1, b0 = above[k1], above[k0]
                # Linearization of inv == |b1 - b0| (XOR over booleans):
                # inv >= b1-b0 ; inv >= b0-b1 ; inv <= b1+b0 ; inv <= 2-(b1+b0)
                model.Add(inv >= b1 - b0)
                model.Add(inv >= b0 - b1)
                model.Add(inv <= b1 + b0)
                model.Add(inv <= 2 - (b1 + b0))
                inv_terms.append(inv)

    # Closeness: strongly-interacting speakers are pulled toward adjacent lanes
    # in every segment where both are active.
    close_terms = []
    close_weights = []
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        for s in range(S):
            if (pa, s) in y and (pb, s) in y:
                d = model.NewIntVar(0, max_lanes, f"dist_{pa}_{pb}_s{s}")
                model.AddAbsEquality(d, y[(pa, s)] - y[(pb, s)])
                close_terms.append(d)
                close_weights.append(float(w))

    # Objective: weighted sum of wiggle, inversions, and closeness terms.
    obj = []
    obj += [2 * t for t in wiggle_terms]
    obj += [4 * t for t in inv_terms]
    for t, w in zip(close_terms, close_weights):
        # CP-SAT requires integer coefficients; scale and cap the weight.
        obj.append(int(min(20.0, 1.0 + w)) * t)

    model.Minimize(sum(obj))

    solver = cp_model.CpSolver()
    solver.parameters.max_time_in_seconds = float(settings.ilp_time_limit_s)
    solver.parameters.num_search_workers = 8

    status = solver.Solve(model)

    if status not in (cp_model.OPTIMAL, cp_model.FEASIBLE):
        # No usable solution within the time limit -> signal fallback via NaNs.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    # Extract solved lane per (speaker, segment).
    y_seg = { (speakers[p], s): float(solver.Value(y[(p, s)])) for (p, s) in y.keys() }

    # Map each message to its speaker's lane in the segment containing it.
    out = metrics.copy()
    out["y"] = np.nan
    for seg_id, start, end in segments:
        mask = (out["message_id"] >= start) & (out["message_id"] < end)
        for sp in speakers:
            m2 = mask & (out["speaker"] == sp)
            if m2.any() and (sp, seg_id) in y_seg:
                out.loc[m2, "y"] = y_seg[(sp, seg_id)]

    out["y_smooth"] = out["y"]  # smoothing could be applied here if desired
    out["line_width"] = 1.0
    return out
|
conversation_storyline/openai_refiner.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, Any, List, Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _client():
    """Build an OpenAI client from the OPENAI_API_KEY environment variable."""
    from openai import OpenAI

    api_key = os.getenv("OPENAI_API_KEY")
    return OpenAI(api_key=api_key)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def pick_reply_to_openai(
    model_name: str,
    target: Dict[str, Any],
    candidates: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Ask an LLM to pick the reply_to_id for *target* among top-K *candidates*.

    Returns a dict {"reply_to_id": int|None, "confidence": 0..1}. On any
    parsing problem (invalid JSON, empty content, missing key) it falls back
    to {"reply_to_id": None, "confidence": 0.2}.

    Fix over the previous version: models frequently wrap JSON in markdown
    ``` fences despite instructions, and `content` can be None — both used
    to make json.loads fail and silently degrade every refinement.
    """
    sys = (
        "Eres un clasificador preciso de 'reply_to' en conversaciones.\n"
        "Debes elegir el message_id al que responde el target, SOLO entre los candidatos.\n"
        "Si ninguno encaja, devuelve reply_to_id=null.\n"
        "Devuelve SOLO JSON válido."
    )

    user = {
        "task": "Pick reply_to_id for target among candidates.",
        "target": target,
        "candidates": candidates,
        "output_schema": {"reply_to_id": "int|null", "confidence": "0..1"},
    }

    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0.0,
    )

    fallback = {"reply_to_id": None, "confidence": 0.2}
    txt = (resp.choices[0].message.content or "").strip()
    # Unwrap markdown code fences (``` or ```json) if present.
    if txt.startswith("```"):
        txt = txt.strip("`").strip()
        if txt.lower().startswith("json"):
            txt = txt[4:].lstrip()
    try:
        data = json.loads(txt)
    except Exception:
        return fallback
    if not isinstance(data, dict) or "reply_to_id" not in data:
        return fallback
    # Some models return the id as a numeric string; coerce to int.
    rid = data.get("reply_to_id")
    if isinstance(rid, str) and rid.isdigit():
        data["reply_to_id"] = int(rid)
    return data
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def label_topics_openai(
    model_name: str,
    segments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Label topic segments with a short name via an LLM.

    input:  [{segment_id, sample_messages:[...]}]
    output: [{segment_id, topic_label}]

    On any parsing failure (or a non-list answer) every segment falls back
    to the generic label "Tema {segment_id}".

    Fix over the previous version: strips markdown ``` fences that models
    often wrap JSON in, guards against None content, and validates that the
    parsed value is actually a list before returning it.
    """
    sys = (
        "Eres un analista de conversaciones. Etiqueta cada segmento con un topic_label corto (3-6 palabras).\n"
        "Devuelve SOLO JSON válido: lista de objetos {segment_id:int, topic_label:str}."
    )

    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(segments, ensure_ascii=False)},
        ],
        temperature=0.2,
    )
    txt = (resp.choices[0].message.content or "").strip()
    # Unwrap markdown code fences (``` or ```json) if present.
    if txt.startswith("```"):
        txt = txt.strip("`").strip()
        if txt.lower().startswith("json"):
            txt = txt[4:].lstrip()
    try:
        data = json.loads(txt)
        if isinstance(data, list):
            return data
    except Exception:
        pass
    # fallback: generic per-segment labels
    return [{"segment_id": s["segment_id"], "topic_label": f"Tema {s['segment_id']}"} for s in segments]
|
conversation_storyline/pipeline.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, Any, Tuple
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from .config import settings, has_openai_key
|
| 10 |
+
from .io import parse_transcript
|
| 11 |
+
from .embeddings import Embedder
|
| 12 |
+
from .reply_to import assign_reply_to_offline, refine_reply_to_with_openai
|
| 13 |
+
from .topic_shifts import detect_topic_shifts_ensemble, build_segments, assign_topics_basic
|
| 14 |
+
from .layout_ilp import compute_storyline_layout_ilp
|
| 15 |
+
from .layout_heuristic import compute_layout_heuristic
|
| 16 |
+
from .render import render_storyline_png
|
| 17 |
+
from .plots import (
|
| 18 |
+
plot_reply_distance_hist,
|
| 19 |
+
plot_interaction_heatmap,
|
| 20 |
+
plot_bump_activity,
|
| 21 |
+
plot_sankey_speaker_to_topic,
|
| 22 |
+
plot_sentiment_placeholder,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _build_metrics(interactions) -> pd.DataFrame:
|
| 27 |
+
rows = []
|
| 28 |
+
for it in interactions:
|
| 29 |
+
rows.append(
|
| 30 |
+
{
|
| 31 |
+
"message_id": it.message_id,
|
| 32 |
+
"speaker": it.speaker,
|
| 33 |
+
"text": it.text,
|
| 34 |
+
"reply_to_id": it.reply_to_id,
|
| 35 |
+
"topic_id": it.topic_id,
|
| 36 |
+
"topic_label": it.topic_label,
|
| 37 |
+
"sentiment": it.sentiment,
|
| 38 |
+
"confidence_reply": it.confidence_reply,
|
| 39 |
+
}
|
| 40 |
+
)
|
| 41 |
+
df = pd.DataFrame(rows)
|
| 42 |
+
# reply distance
|
| 43 |
+
df["reply_distance"] = df.apply(
|
| 44 |
+
lambda r: (r["message_id"] - r["reply_to_id"]) if pd.notna(r["reply_to_id"]) else np.nan,
|
| 45 |
+
axis=1,
|
| 46 |
+
)
|
| 47 |
+
return df
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _interaction_matrix(metrics: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[Tuple[str, str], float]]:
|
| 51 |
+
"""
|
| 52 |
+
Matriz from->to por reply_to.
|
| 53 |
+
También devuelve edges globales (speaker_a,speaker_b)->weight para layout.
|
| 54 |
+
"""
|
| 55 |
+
sp = sorted(metrics["speaker"].unique().tolist())
|
| 56 |
+
mat = pd.DataFrame(0, index=sp, columns=sp, dtype=int)
|
| 57 |
+
edges = {}
|
| 58 |
+
|
| 59 |
+
for _, r in metrics.iterrows():
|
| 60 |
+
if pd.isna(r["reply_to_id"]):
|
| 61 |
+
continue
|
| 62 |
+
rid = int(r["reply_to_id"])
|
| 63 |
+
if rid < 0 or rid >= len(metrics):
|
| 64 |
+
continue
|
| 65 |
+
src = r["speaker"]
|
| 66 |
+
dst = metrics.loc[rid, "speaker"]
|
| 67 |
+
mat.loc[src, dst] += 1
|
| 68 |
+
key = (src, dst)
|
| 69 |
+
edges[key] = edges.get(key, 0.0) + 1.0
|
| 70 |
+
# simetrizar un poco para “cercanía” (no dirección)
|
| 71 |
+
key2 = (dst, src)
|
| 72 |
+
edges[key2] = edges.get(key2, 0.0) + 0.6
|
| 73 |
+
|
| 74 |
+
return mat, edges
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_pipeline_from_text(
    transcript_text: str,
    out_dir: Path,
    openai_model: str = "none (offline)",
) -> Dict[str, Any]:
    """
    End-to-end pipeline: raw transcript text -> storyline artifacts.

    Stages: parse -> embed -> reply_to (offline, optional LLM refine) ->
    topic shifts -> metrics -> interaction matrix -> layout (ILP with
    heuristic fallback) -> PNG render -> plotly figures -> file exports.

    Side effects: creates *out_dir* and writes storyline.png, metrics.csv,
    interactions.jsonl and graph.json into it.

    Returns a dict of artifact paths, the embeddable HTML snippet, a text
    summary and the plotly figures.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1) parse "Speaker: text" lines into Interaction objects
    interactions = parse_transcript(transcript_text)
    if not interactions:
        raise ValueError("No se detectaron líneas tipo 'Speaker: texto'.")

    # 2) one embedding row per message
    embedder = Embedder()
    E = embedder.encode([it.text for it in interactions])

    # 3) reply_to offline + optional LLM refinement
    assign_reply_to_offline(interactions, E)
    if openai_model != "none (offline)" and has_openai_key():
        # Selective refinement (does not refine every message)
        refine_reply_to_with_openai(interactions, E, model_name=openai_model, max_refines=None)

    # 4) topic shifts (ensemble) -> contiguous segments -> basic labels
    shifts = detect_topic_shifts_ensemble(E)
    segments = build_segments(len(interactions), shifts)
    assign_topics_basic(interactions, segments)

    # 5) per-message metrics table
    metrics = _build_metrics(interactions)

    # 6) interaction matrix + weighted edges for layout
    inter_mat, edges = _interaction_matrix(metrics)

    # 7) layout: ILP when the problem is small enough, heuristic otherwise
    metrics["topic_id"] = metrics["topic_id"].fillna(0).astype(int)
    if (metrics["speaker"].nunique() <= settings.ilp_max_participants) and (len(segments) <= settings.ilp_max_segments):
        laid = compute_storyline_layout_ilp(metrics, segments, edges)
        # ILP may time out / be infeasible (all-NaN y): fall back.
        if laid["y"].isna().all():
            laid = compute_layout_heuristic(metrics)
    else:
        laid = compute_layout_heuristic(metrics)

    # 8) render storyline PNG
    storyline_png = out_dir / "storyline.png"
    render_storyline_png(laid, str(storyline_png), title="Dinámica Narrativa (Storyline)")

    # 9) plotly figures
    fig_hist = plot_reply_distance_hist(metrics)
    fig_heat = plot_interaction_heatmap(inter_mat)
    fig_bump = plot_bump_activity(metrics)
    fig_sankey = plot_sankey_speaker_to_topic(metrics)
    fig_sent = plot_sentiment_placeholder(metrics)

    # 10) export artifacts
    metrics_csv = out_dir / "metrics.csv"
    metrics.to_csv(metrics_csv, index=False, encoding="utf-8")

    interactions_jsonl = out_dir / "interactions.jsonl"
    with interactions_jsonl.open("w", encoding="utf-8") as f:
        for it in interactions:
            f.write(json.dumps(it.__dict__, ensure_ascii=False) + "\n")

    # message graph: one node per message, one link per resolved reply_to
    graph_json = out_dir / "graph.json"
    graph_json.write_text(
        json.dumps(
            {
                "nodes": [{"id": int(r["message_id"]), "speaker": r["speaker"]} for _, r in metrics.iterrows()],
                "links": [
                    {"source": int(r["message_id"]), "target": int(r["reply_to_id"])}
                    for _, r in metrics.iterrows()
                    if pd.notna(r["reply_to_id"])
                ],
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
    )

    # simple embeddable storyline HTML: image + stats
    storyline_html = f"""
<div style="font-family: system-ui; line-height: 1.35">
  <h3>Storyline</h3>
  <p><b>Speakers:</b> {metrics["speaker"].nunique()} | <b>Mensajes:</b> {len(metrics)}</p>
  <img src="file/{storyline_png.name}" style="max-width: 100%; border-radius: 12px;" />
  <p style="opacity:0.8">Archivos generados en: {out_dir}</p>
</div>
"""

    # plain-text summary for the UI
    summary = (
        f"Speakers: {metrics['speaker'].nunique()} | Mensajes: {len(metrics)}\n"
        f"Topic segments: {len(segments)} | Shifts detectados: {len(shifts)}\n"
        f"Reply_to NULL: {int(metrics['reply_to_id'].isna().sum())}\n"
        f"Media distancia reply_to: {metrics['reply_distance'].dropna().mean():.2f}\n"
    )

    return {
        "storyline_png": storyline_png,
        "storyline_html": storyline_html,
        "metrics_csv": metrics_csv,
        "interactions_jsonl": interactions_jsonl,
        "graph_json": graph_json,
        "summary_text": summary,
        "fig_sentiment": fig_sent,
        "fig_bump": fig_bump,
        "fig_heatmap": fig_heat,
        "fig_hist_reply_dist": fig_hist,
        "fig_sankey": fig_sankey,
    }
|
conversation_storyline/plots.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import plotly.graph_objects as go
|
| 7 |
+
import networkx as nx
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def plot_reply_distance_hist(metrics: pd.DataFrame) -> go.Figure:
    """Histogram of reply distances (message_id - reply_to_id)."""
    distances = metrics["reply_distance"].dropna().astype(int)
    fig = go.Figure()
    fig.add_histogram(x=distances, nbinsx=40)
    fig.update_layout(
        title="Distribución distancia reply_to (message_id - reply_to_id)",
        xaxis_title="distancia",
        yaxis_title="conteo",
    )
    return fig
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def plot_interaction_heatmap(inter_matrix: pd.DataFrame) -> go.Figure:
    """Heatmap of reply counts between speakers (row replies to column)."""
    heat = go.Heatmap(
        z=inter_matrix.values,
        x=inter_matrix.columns,
        y=inter_matrix.index,
    )
    fig = go.Figure(data=heat)
    fig.update_layout(
        title="Heatmap interacciones (conteo respuestas)",
        xaxis_title="to",
        yaxis_title="from",
    )
    return fig
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def plot_bump_activity(metrics: pd.DataFrame) -> go.Figure:
    """
    Bump chart of each speaker's per-segment activity rank
    (rank 1 = most messages in that topic segment).
    """
    # activity per topic_id and speaker
    if "topic_id" not in metrics.columns:
        fig = go.Figure()
        fig.update_layout(title="Bump actividad (no topic_id)")
        return fig

    piv = metrics.pivot_table(index="topic_id", columns="speaker", values="message_id", aggfunc="count", fill_value=0)
    # rank per segment (highest activity = rank 1); ties get averaged ranks
    ranks = piv.rank(axis=1, method="average", ascending=False)

    fig = go.Figure()
    for sp in piv.columns:
        fig.add_trace(go.Scatter(x=piv.index, y=ranks[sp], mode="lines+markers", name=sp))
    fig.update_layout(
        title="Bump chart: ranking actividad por segmento",
        xaxis_title="topic_id",
        yaxis_title="rank (1 = más activo)",
        # reversed axis puts rank 1 on top
        yaxis_autorange="reversed",
    )
    return fig
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def plot_sankey_speaker_to_topic(metrics: pd.DataFrame) -> go.Figure:
    """Sankey diagram of message volume flowing from speakers to topic labels."""
    if "topic_label" not in metrics.columns:
        fig = go.Figure()
        fig.update_layout(title="Sankey (no topic_label)")
        return fig

    speakers = metrics["speaker"].unique().tolist()
    topics = metrics["topic_label"].fillna("Tema").unique().tolist()

    # Node index space: speakers first, then topics appended after them.
    s_idx = {s: i for i, s in enumerate(speakers)}
    t_idx = {t: i + len(speakers) for i, t in enumerate(topics)}

    # One link per (speaker, topic) pair, weighted by message count.
    links = metrics.groupby(["speaker", "topic_label"])["message_id"].count().reset_index()
    source = [s_idx[r["speaker"]] for _, r in links.iterrows()]
    target = [t_idx[r["topic_label"]] for _, r in links.iterrows()]
    value = links["message_id"].tolist()

    labels = speakers + topics

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(label=labels, pad=10, thickness=12),
                link=dict(source=source, target=target, value=value),
            )
        ]
    )
    fig.update_layout(title="Sankey: Speaker → Topic (volumen de mensajes)")
    return fig
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def plot_sentiment_placeholder(metrics: pd.DataFrame) -> go.Figure:
    """Sentiment-over-time line chart, or an empty figure when no sentiment exists."""
    fig = go.Figure()
    has_sentiment = "sentiment" in metrics.columns and metrics["sentiment"].notna().any()
    if has_sentiment:
        trace = go.Scatter(x=metrics["message_id"], y=metrics["sentiment"], mode="lines+markers")
        fig.add_trace(trace)
    fig.update_layout(title="Sentiment (si disponible)", xaxis_title="message_id", yaxis_title="sentiment")
    return fig
|
conversation_storyline/render.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
|
| 8 |
+
from .config import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def render_storyline_png(metrics: pd.DataFrame, out_png: str, title: str = "Storyline"):
    """
    Render a simple storyline image to *out_png*:
      - x = message_id
      - y = lane assigned per speaker (the "y" column of *metrics*)

    Speakers with fewer than two non-NaN y points are skipped, so a
    single-message speaker is not drawn at all.
    """
    if metrics.empty:
        # Degenerate case: still write a file so callers can embed it.
        fig = plt.figure(figsize=(12, 4))
        plt.title(title)
        plt.text(0.5, 0.5, "No data", ha="center", va="center")
        plt.axis("off")
        fig.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
        plt.close(fig)
        return

    fig = plt.figure(figsize=(14, 6))
    plt.title(title)

    for speaker, g in metrics.groupby("speaker"):
        g = g.sort_values("message_id")
        x = g["message_id"].to_numpy()
        y = g["y"].to_numpy()

        # break the line where y is NaN (no lane assigned)
        ok = ~np.isnan(y)
        if ok.sum() < 2:
            continue

        plt.plot(x[ok], y[ok], linewidth=2.0, alpha=0.9)

    # lanes are abstract — hide the y axis labels
    plt.yticks([])
    plt.xlabel("message_id")
    plt.tight_layout()
    fig.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
    plt.close(fig)
|
conversation_storyline/reply_to.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Tuple, Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from .schemas import Interaction
|
| 7 |
+
from .config import settings
|
| 8 |
+
from .embeddings import cosine_sim_matrix
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def topk_reply_candidates(
    i: int,
    interactions: List[Interaction],
    E: np.ndarray,
    window: int,
    top_k: int,
) -> List[Tuple[int, float]]:
    """
    Return (candidate_id, similarity) pairs for message *i*, looking backwards.

    Heuristic:
      - only the last `window` messages before i are considered
      - scored by cosine similarity (E is expected to be row-normalized)
      - the immediately preceding message is force-added to the pool

    NOTE(review): `interactions` is unused here. Also, the final top_k
    truncation can still drop the preceding message when its similarity is
    the lowest, so the "always included" intent is only best-effort —
    confirm whether callers rely on prev being present.
    """
    if i <= 0:
        return []

    start = max(0, i - window)
    cand_ids = list(range(start, i))

    sims = cosine_sim_matrix(E[cand_ids], E[i])
    order = np.argsort(-sims)

    top = [(cand_ids[idx], float(sims[idx])) for idx in order[:top_k]]

    # make sure the immediately preceding message is in the pool
    prev = i - 1
    if prev not in [c for c, _ in top]:
        prev_sim = float(cosine_sim_matrix(E[[prev]], E[i])[0])
        top.append((prev, prev_sim))
        top.sort(key=lambda x: x[1], reverse=True)

    return top[:top_k]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def assign_reply_to_offline(
    interactions: List[Interaction],
    E: np.ndarray,
) -> None:
    """
    Offline (no-LLM) reply_to assignment; mutates *interactions* in place.

    The first message never replies to anything (confidence 1.0). For every
    other message the best-scoring candidate becomes reply_to_id when its
    cosine similarity reaches settings.reply_min_sim; otherwise reply_to is
    left as None. confidence_reply mirrors the similarity either way.
    """
    for idx, inter in enumerate(interactions):
        if idx == 0:
            inter.reply_to_id = None
            inter.confidence_reply = 1.0
            continue

        candidates = topk_reply_candidates(
            i=idx,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )

        if not candidates:
            inter.reply_to_id = None
            inter.confidence_reply = 0.0
            continue

        best_id, best_sim = candidates[0]
        # ambiguous top1/top2 cases still take top1 offline; the optional
        # LLM pass may refine them later
        inter.confidence_reply = float(best_sim)
        if best_sim >= settings.reply_min_sim:
            inter.reply_to_id = int(best_id)
        else:
            inter.reply_to_id = None
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def needs_llm_refine(i: int, top: List[Tuple[int, float]]) -> bool:
    """
    Decide whether a candidate list deserves an LLM second opinion:
    no candidates at all, best similarity below the floor, or a
    top-1/top-2 gap smaller than the ambiguity delta.
    """
    if not top:
        return True
    best_sim = top[0][1]
    if best_sim < settings.reply_min_sim:
        return True
    return len(top) >= 2 and (best_sim - top[1][1]) < settings.reply_ambig_delta
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def refine_reply_to_with_openai(
    interactions: List[Interaction],
    E: np.ndarray,
    model_name: str,
    max_refines: Optional[int] = None,
) -> None:
    """
    Selective reply_to refinement with OpenAI; mutates *interactions*.

    - Only "ambiguous" messages (per needs_llm_refine) are sent to the LLM.
    - The offline top-K candidates are offered; the model picks reply_to_id.
    - Stops after *max_refines* LLM calls when a limit is given.
    """
    # Local import keeps the openai dependency optional for offline runs.
    from .openai_refiner import pick_reply_to_openai

    if model_name == "none (offline)":
        return

    refined = 0
    for i in range(1, len(interactions)):
        top = topk_reply_candidates(
            i=i,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )
        if not needs_llm_refine(i, top):
            continue

        cand_pack = []
        for cid, sim in top:
            cand_pack.append(
                {
                    "message_id": int(cid),
                    "speaker": interactions[cid].speaker,
                    # truncate candidate texts to keep the prompt small
                    "text": interactions[cid].text[:600],
                    "sim": float(sim),
                }
            )

        picked = pick_reply_to_openai(
            model_name=model_name,
            target={
                "message_id": int(i),
                "speaker": interactions[i].speaker,
                "text": interactions[i].text,
            },
            candidates=cand_pack,
        )

        # the LLM answer overrides the offline guess (possibly back to None)
        interactions[i].reply_to_id = picked.get("reply_to_id", None)
        interactions[i].confidence_reply = float(picked.get("confidence", interactions[i].confidence_reply or 0.0))

        refined += 1
        if max_refines and refined >= max_refines:
            break
|
conversation_storyline/schemas.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Optional, Dict, Any
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class Interaction:
    """One conversation message plus the attributes inferred by the pipeline."""

    message_id: int  # 0-based position in the transcript
    speaker: str
    text: str

    # inferred by the reply_to / topic-shift stages (None until assigned)
    reply_to_id: Optional[int] = None
    topic_id: Optional[int] = None
    topic_label: Optional[str] = None

    # optional metrics
    sentiment: Optional[float] = None  # may stay None; not set by the offline pipeline shown here
    confidence_reply: Optional[float] = None  # similarity or LLM confidence backing reply_to_id
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class PipelineArtifacts:
    """Bundle of file paths and figures produced by one pipeline run."""

    out_dir: str
    storyline_png: str
    storyline_html: str  # embeddable HTML snippet, not a file path
    metrics_csv: str
    interactions_jsonl: str
    graph_json: str

    summary_text: str
    figs: Dict[str, Any]  # name -> plotly figure
|
conversation_storyline/topic_shifts.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple, Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
import ruptures as rpt
|
| 6 |
+
|
| 7 |
+
from .config import settings
|
| 8 |
+
from .schemas import Interaction
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def detect_topic_shifts_ruptures(E: np.ndarray) -> List[int]:
    """
    Change-point detection over the (normalized) embedding matrix.

    Returns message indices that START a new segment (index 0 excluded).
    Returns [] when the conversation is too short to hold two minimum-size
    segments.
    """
    n = E.shape[0]
    if n < settings.topic_min_size * 2:
        return []

    # Signal is the (n, d) embedding matrix; ruptures consumes it directly.
    algo = rpt.Pelt(model="rbf", min_size=settings.topic_min_size).fit(E)
    # Penalty scales with log(n) so longer chats need stronger evidence to split.
    pen = settings.topic_penalty_scale * np.log(max(n, 2))
    bkps = algo.predict(pen=pen)  # returns endpoints (includes n)

    # convert endpoints to segment starts, dropping too-short segments
    starts = []
    prev = 0
    for end in bkps:
        if end >= n:
            break
        if end - prev >= settings.topic_min_size:
            starts.append(end)
        prev = end

    return starts
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def detect_topic_shifts_ensemble(E: np.ndarray) -> List[int]:
    """
    Combine two shift detectors over the embedding matrix:
      - ruptures change-point detection
      - indices whose adjacent-message cosine similarity falls in the
        worst decile
    Returns the sorted union of interior indices (0 < idx < n).
    """
    n = E.shape[0]
    candidates = set(detect_topic_shifts_ruptures(E))
    if n >= 4:
        # rows are unit-normalized, so the dot product IS the cosine sim
        adjacent_sims = (E[1:] * E[:-1]).sum(axis=1)
        cutoff = float(np.percentile(adjacent_sims, 10))
        candidates.update(
            idx for idx, sim in enumerate(adjacent_sims, start=1) if sim <= cutoff
        )
    return sorted(idx for idx in candidates if 0 < idx < n)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def build_segments(n_messages: int, shift_starts: List[int]) -> List[Tuple[int, int, int]]:
    """
    Turn shift start indices into contiguous segments.

    Returns (segment_id, start, end_exclusive) tuples covering
    [0, n_messages); duplicate or out-of-range starts are ignored.
    """
    interior = sorted({s for s in shift_starts if 0 < s < n_messages})
    boundaries = [0] + interior + [n_messages]
    return [
        (seg_id, lo, hi)
        for seg_id, (lo, hi) in enumerate(zip(boundaries, boundaries[1:]))
    ]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def assign_topics_basic(interactions: List[Interaction], segments: List[Tuple[int, int, int]]) -> None:
    """Stamp each message with its segment id and a placeholder label (in place)."""
    for seg_id, start, end in segments:
        for msg_idx in range(start, end):
            interactions[msg_idx].topic_id = seg_id
            interactions[msg_idx].topic_label = f"Tema {seg_id}"
|
requirements.txt
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
-
gradio
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
networkx
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.1
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
pandas==2.2.2
|
| 4 |
+
matplotlib==3.9.0
|
| 5 |
+
plotly==5.22.0
|
| 6 |
+
networkx==3.3
|
| 7 |
+
ruptures==1.1.9
|
| 8 |
+
ortools==9.10.4067
|
| 9 |
+
scikit-learn==1.5.1
|
| 10 |
+
sentence-transformers==3.0.1
|
| 11 |
+
|
| 12 |
+
# Opcional (solo si quieres refinamiento LLM):
|
| 13 |
+
openai==1.40.8
|