Merlintxu committed on
Commit
c119a77
·
verified ·
1 Parent(s): ba9b296

Upload 23 files

Browse files
app.py CHANGED
@@ -1,296 +1,107 @@
1
- import os
2
- import re
3
- import json
4
- import tempfile
5
- from dataclasses import dataclass
6
- from typing import List, Tuple, Dict, Optional
7
-
8
- import gradio as gr
9
- import numpy as np
10
- import pandas as pd
11
- import plotly.express as px
12
- import plotly.graph_objects as go
13
- import matplotlib.pyplot as plt
14
-
15
-
16
- # ----------------------------
17
- # Helpers: parsing transcript
18
- # ----------------------------
19
- SPEAKER_LINE = re.compile(r"^\s*([^:]{1,64})\s*:\s*(.+)\s*$")
20
-
21
- @dataclass
22
- class Msg:
23
- id: int
24
- speaker: str
25
- content: str
26
-
27
-
28
- def parse_transcript(text: str) -> List[Msg]:
29
- """
30
- Very robust parser:
31
- - If lines look like "Speaker: message" => uses speaker
32
- - Otherwise speaker="Unknown"
33
- - Skips empty lines
34
- """
35
- if not text:
36
- return []
37
-
38
- msgs: List[Msg] = []
39
- i = 1
40
- for raw in text.splitlines():
41
- line = raw.strip()
42
- if not line:
43
- continue
44
-
45
- m = SPEAKER_LINE.match(line)
46
- if m:
47
- speaker = m.group(1).strip()
48
- content = m.group(2).strip()
49
- else:
50
- speaker = "Unknown"
51
- content = line
52
-
53
- msgs.append(Msg(id=i, speaker=speaker, content=content))
54
- i += 1
55
-
56
- return msgs
57
-
58
-
59
- # ----------------------------
60
- # Your analysis hook
61
- # ----------------------------
62
- def run_analysis(transcript_text: str, model_name: str):
63
- """
64
- Replace the body of this function with YOUR real pipeline call.
65
- It must return:
66
- interactions: list[dict] with keys at least:
67
- id, speaker, reply_to_id (or None), topic_label, is_topic_shift, sentiment_score
68
- metrics_df: DataFrame with columns: t_start, speaker, centrality (optional), community (optional)
69
- layout_df: DataFrame with columns: t_start, speaker, y or y_smooth
70
- topic_shifts: list[int] message IDs (ensemble)
71
- summary: str
72
- """
73
- # ---- Example stub using your existing output files (if you have them) ----
74
- # If your pipeline already returns these objects, CALL IT here and return them.
75
-
76
- # For now, we build a trivial "interactions" baseline so the UI works.
77
- msgs = parse_transcript(transcript_text)
78
- interactions = []
79
- for m in msgs:
80
- interactions.append(
81
- dict(
82
- id=m.id,
83
- speaker=m.speaker,
84
- reply_to_id=(m.id - 1 if m.id > 1 else None),
85
- topic_label="General",
86
- is_topic_shift=False,
87
- sentiment_score=0.0,
88
- )
89
- )
90
-
91
- # Fake metrics/layout (minimal) so plots render; replace with real metrics/layout from your pipeline.
92
- speakers = sorted({it["speaker"] for it in interactions})
93
- t_starts = list(range(1, max(2, len(interactions)), 5))
94
- rows = []
95
- for t in t_starts:
96
- for s in speakers:
97
- rows.append({"t_start": t, "speaker": s, "centrality": 0.0, "community": 0})
98
- metrics_df = pd.DataFrame(rows)
99
-
100
- # layout: put speakers on fixed lanes
101
- lay_rows = []
102
- for t in t_starts:
103
- for i, s in enumerate(speakers):
104
- lay_rows.append({"t_start": t, "speaker": s, "y": float(i), "y_smooth": float(i)})
105
- layout_df = pd.DataFrame(lay_rows)
106
-
107
- topic_shifts = []
108
- summary = f"Mensajes: {len(interactions)} | Participantes: {len(speakers)} | Modelo: {model_name}"
109
- return interactions, metrics_df, layout_df, topic_shifts, summary
110
-
111
-
112
- # ----------------------------
113
- # Plot builders
114
- # ----------------------------
115
- def render_storyline_png(layout_df: pd.DataFrame, topic_shifts: List[int], out_path: str) -> str:
116
- """
117
- Minimal storyline render:
118
- - x axis: t_start
119
- - y axis: lane position
120
- """
121
- if layout_df is None or layout_df.empty:
122
- # Create an empty placeholder image
123
- fig, ax = plt.subplots(figsize=(10, 3))
124
- ax.text(0.5, 0.5, "No layout data", ha="center", va="center")
125
- ax.axis("off")
126
- fig.savefig(out_path, dpi=160, bbox_inches="tight")
127
- plt.close(fig)
128
- return out_path
129
-
130
- df = layout_df.copy()
131
- if "y_smooth" not in df.columns:
132
- df["y_smooth"] = df["y"]
133
-
134
- fig, ax = plt.subplots(figsize=(14, 6))
135
- for sp, g in df.groupby("speaker", sort=False):
136
- g = g.sort_values("t_start")
137
- ax.plot(g["t_start"], g["y_smooth"], linewidth=1.5)
138
- # label at start
139
- ax.text(g["t_start"].iloc[0], g["y_smooth"].iloc[0], str(sp), fontsize=9)
140
-
141
- # topic shifts as vertical lines (approx: by message id; if your x is t_start, adapt mapping)
142
- for x in topic_shifts:
143
- ax.axvline(x=x, linewidth=1.0, linestyle="--", alpha=0.6)
144
-
145
- ax.set_title("Storyline")
146
- ax.set_xlabel("t")
147
- ax.set_yticks([])
148
- fig.tight_layout()
149
- fig.savefig(out_path, dpi=160, bbox_inches="tight")
150
- plt.close(fig)
151
- return out_path
152
-
153
-
154
- def plot_sentiment(interactions: List[dict]):
155
- df = pd.DataFrame(interactions)
156
- if df.empty:
157
- return go.Figure()
158
- return px.scatter(df, x="id", y="sentiment_score", color="speaker", title="Sentiment timeline")
159
-
160
-
161
- def plot_reply_distance_hist(interactions: List[dict]):
162
- df = pd.DataFrame(interactions)
163
- if df.empty:
164
- return go.Figure()
165
- d = df.dropna(subset=["reply_to_id"]).assign(dist=lambda x: x["id"] - x["reply_to_id"])
166
- return px.histogram(d, x="dist", nbins=50, title="Reply distance histogram")
167
-
168
-
169
- def plot_bump(metrics_df: pd.DataFrame, top_n: int = 20):
170
- if metrics_df is None or metrics_df.empty:
171
- return go.Figure()
172
- avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
173
- df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
174
- df["rank"] = df.groupby("t_start")["centrality"].rank(ascending=False, method="dense")
175
- fig = px.line(df, x="t_start", y="rank", color="speaker", title=f"Bump chart (Top {top_n})")
176
- fig.update_yaxes(autorange="reversed")
177
- return fig
178
-
179
-
180
- def plot_heatmap(metrics_df: pd.DataFrame, top_n: int = 20):
181
- if metrics_df is None or metrics_df.empty:
182
- return go.Figure()
183
- avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
184
- df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
185
- piv = df.pivot_table(index="speaker", columns="t_start", values="centrality", aggfunc="mean").fillna(0)
186
- return px.imshow(piv, aspect="auto", title=f"Centrality heatmap (Top {top_n})")
187
-
188
-
189
- def plot_topic_sankey(interactions: List[dict]):
190
- df = pd.DataFrame(interactions)
191
- if df.empty or "topic_label" not in df.columns:
192
- return go.Figure()
193
-
194
- topics = df["topic_label"].astype(str).tolist()
195
- links: Dict[Tuple[str, str], int] = {}
196
- for a, b in zip(topics[:-1], topics[1:]):
197
- if a == b:
198
- continue
199
- links[(a, b)] = links.get((a, b), 0) + 1
200
-
201
- if not links:
202
- return go.Figure()
203
-
204
- nodes = sorted(set([t for ab in links.keys() for t in ab]))
205
- idx = {n: i for i, n in enumerate(nodes)}
206
-
207
- fig = go.Figure(
208
- data=[
209
- go.Sankey(
210
- node=dict(label=nodes),
211
- link=dict(
212
- source=[idx[a] for (a, b) in links.keys()],
213
- target=[idx[b] for (a, b) in links.keys()],
214
- value=list(links.values()),
215
- ),
216
- )
217
- ]
218
- )
219
- fig.update_layout(title="Topic transitions (Sankey)")
220
- return fig
221
-
222
-
223
- # ----------------------------
224
- # Gradio callback (NO global state)
225
- # ----------------------------
226
- def process_transcript(transcript_text: str, model_name: str):
227
- # Run your pipeline (replace run_analysis internals with your real pipeline)
228
- interactions, metrics_df, layout_df, topic_shifts, summary = run_analysis(transcript_text, model_name)
229
-
230
- # ✅ This is where your old code crashed:
231
- # all_interactions was not defined. Here we use `interactions`.
232
- active_participants = sorted({n["speaker"] for n in interactions}) if interactions else []
233
-
234
- # Build image
235
- tmpdir = tempfile.mkdtemp(prefix="storyline_")
236
- img_path = os.path.join(tmpdir, "storyline.png")
237
- render_storyline_png(layout_df, topic_shifts, img_path)
238
-
239
- # Plots
240
- sentiment_fig = plot_sentiment(interactions)
241
- bump_fig = plot_bump(metrics_df, top_n=25)
242
- heatmap_fig = plot_heatmap(metrics_df, top_n=25)
243
- hist_fig = plot_reply_distance_hist(interactions)
244
- sankey_fig = plot_topic_sankey(interactions)
245
-
246
- # Summary text
247
- summary_full = summary + f"\nParticipantes activos: {', '.join(active_participants[:50])}" + (
248
- " ..." if len(active_participants) > 50 else ""
249
- )
250
-
251
- return img_path, summary_full, sentiment_fig, bump_fig, heatmap_fig, hist_fig, sankey_fig
252
-
253
-
254
- # ------------------------------------------------------------------
255
- # Theme + UI (ONLY ONCE)
256
- # ------------------------------------------------------------------
257
- theme = gr.themes.Soft(primary_hue="blue").set(
258
- body_background_fill="*neutral_50",
259
- block_background_fill="*neutral_100",
260
- )
261
-
262
- with gr.Blocks(
263
- title="Conversation Storyline Visualizer – Advanced",
264
- theme=theme, # ✅ theme belongs here (more compatible than demo.launch(theme=...))
265
- ) as demo:
266
- gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones")
267
- gr.Markdown("Soporte para conversaciones largas con chunking + refinamiento reply_to y topic shifts ensemble.")
268
-
269
- with gr.Row():
270
- model_selector = gr.Dropdown(
271
- choices=["gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"],
272
- value="gpt-4o-2024-08-06",
273
- label="Modelo OpenAI",
274
- )
275
-
276
- input_text = gr.Textbox(label="Transcripción", lines=20)
277
- btn = gr.Button("Generar Visualizaciones", variant="primary")
278
-
279
- with gr.Tabs():
280
- with gr.Tab("Storyline Principal"):
281
- main_img = gr.Image(label="Storyline (PNG)")
282
- summary_box = gr.Textbox(label="Resumen", lines=6)
283
- with gr.Tab("Análisis Detallado"):
284
- sentiment_plot = gr.Plot(label="Sentimiento")
285
- bump_plot = gr.Plot(label="Ranking (Bump)")
286
- heatmap_plot = gr.Plot(label="Heatmap centralidad")
287
- hist_plot = gr.Plot(label="Hist reply-distance")
288
- sankey_plot = gr.Plot(label="Sankey topics")
289
-
290
- btn.click(
291
- fn=process_transcript,
292
- inputs=[input_text, model_selector],
293
- outputs=[main_img, summary_box, sentiment_plot, bump_plot, heatmap_plot, hist_plot, sankey_plot],
294
- )
295
-
296
- demo.launch()
 
1
+ import os
2
+ import tempfile
3
+ from pathlib import Path
4
+ import gradio as gr
5
+
6
+ from conversation_storyline.pipeline import run_pipeline_from_text
7
+
8
+
9
# Shared Gradio theme: soft blue primary over light neutral fills.
THEME = gr.themes.Soft(primary_hue="blue").set(
    body_background_fill="*neutral_50",
    block_background_fill="*neutral_100",
)

# Browser tab / window title for the gr.Blocks app defined below.
TITLE = "Conversation Storyline Visualizer – v4"
15
+
16
+
17
def process_transcript(transcript: str, model_selector: str):
    """Run the storyline pipeline on pasted transcript text.

    Parameters:
        transcript: raw pasted transcript (``Speaker: text`` lines).
        model_selector: chosen model name ("none (offline)" keeps it local).

    Returns an 11-tuple matching the Gradio outputs: storyline PNG path,
    summary text, five plotly figures, embedded HTML, and three artifact
    files (metrics.csv, interactions.jsonl, graph.json).

    Raises gr.Error when the transcript box is empty or whitespace-only.
    """
    if not (transcript or "").strip():
        raise gr.Error("Pega una transcripción en el cuadro de texto.")

    # Fresh scratch directory per request so concurrent runs never collide.
    work_dir = Path(tempfile.mkdtemp(prefix="storyline_v4_"))

    result = run_pipeline_from_text(
        transcript_text=transcript,
        out_dir=work_dir,
        openai_model=model_selector,
    )

    # Order must stay in sync with the `outputs=[...]` list in btn.click.
    trailing_keys = (
        "summary_text",
        "fig_sentiment",
        "fig_bump",
        "fig_heatmap",
        "fig_hist_reply_dist",
        "fig_sankey",
        "storyline_html",
        "metrics_csv",
        "interactions_jsonl",
        "graph_json",
    )
    return (str(result["storyline_png"]),) + tuple(result[k] for k in trailing_keys)
46
+
47
+
48
# ---------------------------------------------------------------------------
# Gradio UI. Widgets created inside the Blocks context are wired to
# process_transcript() via btn.click at the bottom.
# ---------------------------------------------------------------------------
with gr.Blocks(title=TITLE, theme=THEME) as demo:
    gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones (v4)")
    gr.Markdown(
        "- Pega una transcripción tipo `Speaker A: ...`\n"
        "- Soporta conversaciones largas (chunking)\n"
        "- Reply-to Top-K embeddings + topic shifts ruptures + layout OR-Tools\n"
    )

    with gr.Row():
        # Model choice is optional; "none (offline)" keeps the pipeline local.
        model_selector = gr.Dropdown(
            choices=[
                "gpt-4o-2024-08-06",
                "gpt-4o-mini-2024-07-18",
                "none (offline)",
            ],
            value="none (offline)",
            label="Modelo (opcional; si hay OPENAI_API_KEY)",
        )

    input_text = gr.Textbox(label="Transcripción", lines=20, placeholder="Pega aquí la transcripción...")
    btn = gr.Button("Generar Visualizaciones", variant="primary")

    with gr.Tabs():
        # Main tab: rendered storyline image, text summary, embedded HTML.
        with gr.Tab("Storyline Principal"):
            main_img = gr.Image(label="Storyline (PNG)")
            summary_box = gr.Textbox(label="Resumen", lines=10)
            storyline_html = gr.HTML(label="Storyline (HTML embebido)")
        # Diagnostics tab: the five plotly figures from the pipeline.
        with gr.Tab("Análisis Detallado"):
            sentiment_plot = gr.Plot(label="Sentiment (si aplica)")
            bump_plot = gr.Plot(label="Bump actividad por segmento")
            heatmap_plot = gr.Plot(label="Heatmap interacciones")
            hist_plot = gr.Plot(label="Histograma distancia reply_to")
            sankey_plot = gr.Plot(label="Sankey Speaker → Topic")

        # Downloads tab: raw artifact files produced by the pipeline.
        with gr.Tab("Descargas"):
            metrics_csv = gr.File(label="metrics.csv")
            interactions_jsonl = gr.File(label="interactions.jsonl")
            graph_json = gr.File(label="graph.json")

    # The outputs list order must match the tuple returned by
    # process_transcript (PNG, summary, 5 figures, HTML, 3 files).
    btn.click(
        fn=process_transcript,
        inputs=[input_text, model_selector],
        outputs=[
            main_img,
            summary_box,
            sentiment_plot,
            bump_plot,
            heatmap_plot,
            hist_plot,
            sankey_plot,
            storyline_html,
            metrics_csv,
            interactions_jsonl,
            graph_json,
        ],
    )

if __name__ == "__main__":
    # HF Spaces: bind all interfaces; honor the PORT env var, default 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
conversation_storyline/__init__.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Submodules of the conversation_storyline package re-exported by
# `from conversation_storyline import *`.
# NOTE(review): each name is assumed to match a module file shipped in
# this commit (pipeline.py, io.py, ...) — verify "render"/"plots" exist.
__all__ = [
    "pipeline",
    "io",
    "schemas",
    "embeddings",
    "reply_to",
    "topic_shifts",
    "layout_ilp",
    "layout_heuristic",
    "render",
    "plots",
]
conversation_storyline/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (314 Bytes). View file
 
conversation_storyline/__pycache__/config.cpython-313.pyc ADDED
Binary file (1.49 kB). View file
 
conversation_storyline/__pycache__/embeddings.cpython-313.pyc ADDED
Binary file (3.08 kB). View file
 
conversation_storyline/__pycache__/io.cpython-313.pyc ADDED
Binary file (2.04 kB). View file
 
conversation_storyline/__pycache__/pipeline.cpython-313.pyc ADDED
Binary file (8.41 kB). View file
 
conversation_storyline/__pycache__/reply_to.cpython-313.pyc ADDED
Binary file (5.42 kB). View file
 
conversation_storyline/__pycache__/schemas.cpython-313.pyc ADDED
Binary file (1.42 kB). View file
 
conversation_storyline/__pycache__/topic_shifts.cpython-313.pyc ADDED
Binary file (3.76 kB). View file
 
conversation_storyline/config.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import os
3
+
4
+
5
@dataclass(frozen=True)  # frozen: configuration is immutable after creation
class Settings:
    """Tunable knobs for the conversation-storyline pipeline."""

    # Parsing / ingest
    max_speaker_label_len: int = 64

    # Reply-to
    reply_window: int = 60  # how many previous messages to consider as candidates
    reply_top_k: int = 10  # top-k candidates kept by embedding similarity
    reply_min_sim: float = 0.25  # if top-1 sim < threshold -> reply may stay None (offline)
    reply_ambig_delta: float = 0.03  # if top1-top2 < delta -> candidate for LLM refinement (if available)

    # Topic shifts (ruptures)
    topic_min_size: int = 8
    topic_penalty_scale: float = 2.4  # higher -> fewer change points

    # Layout (OR-Tools)
    ilp_time_limit_s: float = 6.0
    ilp_max_participants: int = 28  # above this, fall back from ILP layout
    ilp_max_segments: int = 120  # above this, fall back from ILP layout

    # Rendering
    storyline_dpi: int = 180


# Shared module-level instance used across the package.
settings = Settings()
30
+
31
+
32
def has_openai_key() -> bool:
    """Report whether a non-blank OPENAI_API_KEY is set in the environment."""
    key = os.getenv("OPENAI_API_KEY", "")
    return key.strip() != ""
conversation_storyline/embeddings.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import List, Optional
5
+ import numpy as np
6
+
7
+
8
@dataclass
class Embedder:
    """
    CPU text embeddings via Sentence-Transformers.
    - Vectors are L2-normalized so cosine similarity is a plain dot product.
    - Falls back to a simple TF-IDF representation if ST is unavailable.
    """
    model_name: str = "intfloat/multilingual-e5-small"
    # NOTE(review): the three names below are unannotated, so they are class
    # attributes (shared defaults), not dataclass fields; assigning
    # self._st etc. shadows them per instance.
    _st = None
    _tfidf = None
    _tfidf_vectorizer = None

    def _load_st(self):
        # Lazy import/load: the heavy dependency is only pulled on first use.
        if self._st is None:
            from sentence_transformers import SentenceTransformer
            self._st = SentenceTransformer(self.model_name)

    def encode(self, texts: List[str]) -> np.ndarray:
        """Encode *texts* into a (n, d) float32 array of normalized rows."""
        texts = [t or "" for t in texts]
        try:
            self._load_st()
            # e5 models expect "passage: " / "query: " prefixes; "passage:" suffices here
            inp = [("passage: " + t) for t in texts]
            X = np.array(self._st.encode(inp, normalize_embeddings=True, show_progress_bar=False), dtype=np.float32)
            return X
        except Exception:
            # TF-IDF fallback (not truly semantic, but always works offline)
            from sklearn.feature_extraction.text import TfidfVectorizer
            if self._tfidf_vectorizer is None:
                # First fallback call fits the vocabulary; later calls only
                # transform, so new-batch out-of-vocabulary terms are dropped.
                self._tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
                X = self._tfidf_vectorizer.fit_transform(texts).astype(np.float32)
            else:
                X = self._tfidf_vectorizer.transform(texts).astype(np.float32)
            # densify and normalize rows (epsilon guards all-zero rows)
            X = X.toarray()
            norms = np.linalg.norm(X, axis=1, keepdims=True) + 1e-9
            return X / norms
45
+
46
+
47
def cosine_sim_matrix(A: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity of each row of *A* against vector *b*.

    Both inputs are assumed L2-normalized, so the dot product already is
    the cosine similarity.

    A: (n, d) normalized matrix.
    b: (d,) normalized vector.
    Returns: float32 array of shape (n,).
    """
    sims = np.dot(A, b)
    return sims.astype(np.float32)
conversation_storyline/io.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List
3
+ from .schemas import Interaction
4
+
5
+
6
# Ordered patterns for "SPEAKER: text" lines; first match wins in parse_transcript.
SPEAKER_PATTERNS = [
    # Explicit "Speaker A: ..." style labels (up to 64 chars after "Speaker").
    re.compile(r"^(?P<speaker>Speaker\s+[A-Za-z0-9_\- ]{1,64})\s*:\s*(?P<text>.+)\s*$"),
    # Short free-form labels such as "Ana: ..." (Spanish accents allowed, max 32 chars).
    re.compile(r"^(?P<speaker>[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9_\- ]{1,32})\s*:\s*(?P<text>.+)\s*$"),
]
12
+
13
+
14
def parse_transcript(text: str) -> List[Interaction]:
    """Parse pasted transcript text into a list of Interaction messages.

    A line matching one of SPEAKER_PATTERNS ("SPEAKER: text") opens a new
    message; any other non-blank line is appended as a continuation of the
    previous message. Text appearing before the first labelled line is
    attributed to speaker "Unknown". Blank lines are ignored.
    """
    interactions: List[Interaction] = []
    current = None  # last message, receives continuation lines

    for raw_line in (text or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        match = next(
            (m for m in (pat.match(stripped) for pat in SPEAKER_PATTERNS) if m),
            None,
        )

        if match is not None:
            # New "SPEAKER: text" message; id is the running position.
            current = Interaction(
                message_id=len(interactions),
                speaker=match.group("speaker").strip(),
                text=match.group("text").strip(),
            )
            interactions.append(current)
        elif current is not None:
            # Continuation of the previous message.
            current.text = (current.text + " " + stripped).strip()
        else:
            # Leading unlabelled text: open an "Unknown" message.
            current = Interaction(message_id=0, speaker="Unknown", text=stripped)
            interactions.append(current)

    return interactions
conversation_storyline/layout_heuristic.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Tuple
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
def compute_layout_heuristic(metrics: pd.DataFrame) -> pd.DataFrame:
    """
    Fast fallback layout:
    - ranks speakers by total activity (message count, descending)
    - pins each speaker to one fixed horizontal lane (y)
    Returns *metrics* unchanged when empty; otherwise a copy with
    y, y_smooth, and line_width columns added.
    """
    if metrics.empty:
        return metrics

    activity = metrics.groupby("speaker")["message_id"].count()
    ranked = activity.sort_values(ascending=False).index.tolist()
    lane_of = dict(zip(ranked, range(len(ranked))))

    laid_out = metrics.copy()
    laid_out["y"] = laid_out["speaker"].map(lane_of).astype(float)
    laid_out["y_smooth"] = laid_out["y"]  # no smoothing in the heuristic
    laid_out["line_width"] = 1.0
    return laid_out
conversation_storyline/layout_ilp.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Tuple, Set
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+ from ortools.sat.python import cp_model
8
+
9
+ from .config import settings
10
+
11
+
12
def compute_storyline_layout_ilp(
    metrics: pd.DataFrame,
    segments: List[Tuple[int, int, int]],
    interactions_edges: Dict[Tuple[str, str], float],
) -> pd.DataFrame:
    """
    Per-segment storyline layout via CP-SAT, minimizing:
    - order inversions between consecutive segments (proxy for line crossings)
    - wiggle (|y_t - y_{t-1}| per speaker)
    - vertical distance between speakers that interact in the same segment

    metrics: one row per message with speaker, message_id, topic_id, etc.
    segments: list of (seg_id, start, end) message-id ranges (end exclusive)
    interactions_edges: global (speaker_a, speaker_b) -> weight for closeness

    Returns a copy of *metrics* with y / y_smooth / line_width columns;
    on empty input, oversized problems, or solver failure the y columns
    are NaN so the caller can fall back to the heuristic layout.
    """
    if metrics.empty:
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    speakers = sorted(metrics["speaker"].unique().tolist())
    P = len(speakers)
    S = len(segments)

    if P > settings.ilp_max_participants or S > settings.ilp_max_segments:
        # Too large for the solver; the pipeline decides the fallback — this
        # NaN return is a safety net.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    sp_idx = {s: i for i, s in enumerate(speakers)}
    # Lane count = max distinct speakers within any topic (at least 1).
    max_lanes = max(
        metrics.groupby("topic_id")["speaker"].nunique().max(),
        1
    )

    # active[p][s] = True when speaker p has a message inside segment s.
    active = [[False] * S for _ in range(P)]
    for seg_id, start, end in segments:
        seg_speakers = set(metrics[(metrics["message_id"] >= start) & (metrics["message_id"] < end)]["speaker"])
        for sp in seg_speakers:
            active[sp_idx[sp]][seg_id] = True

    model = cp_model.CpModel()

    # y[(p, s)] = lane of speaker p in segment s; only created when active.
    y = {}
    for p in range(P):
        for s in range(S):
            if active[p][s]:
                y[(p, s)] = model.NewIntVar(0, max_lanes - 1, f"y_p{p}_s{s}")

    # Distinct lanes per segment (only among active speakers).
    for s in range(S):
        vars_s = [y[(p, s)] for p in range(P) if (p, s) in y]
        if len(vars_s) >= 2:
            model.AddAllDifferent(vars_s)

    # Wiggle terms: |y(p,s) - y(p,s-1)| for each consecutive active pair.
    wiggle_terms = []
    for p in range(P):
        for s in range(1, S):
            if (p, s) in y and (p, s - 1) in y:
                d = model.NewIntVar(0, max_lanes, f"wiggle_p{p}_s{s}")
                model.AddAbsEquality(d, y[(p, s)] - y[(p, s - 1)])
                wiggle_terms.append(d)

    # Pair ordering vars above[p,q,s]; restricted to pairs with a positive
    # interaction weight to keep the model small.
    interesting_pairs: Set[Tuple[int, int]] = set()
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        if pa < pb:
            interesting_pairs.add((pa, pb))
        else:
            interesting_pairs.add((pb, pa))

    above = {}
    for (p, q) in interesting_pairs:
        for s in range(S):
            if (p, s) in y and (q, s) in y:
                b = model.NewBoolVar(f"above_p{p}_q{q}_s{s}")  # p above q
                above[(p, q, s)] = b
                # b -> y[p] + 1 <= y[q]
                model.Add(y[(p, s)] + 1 <= y[(q, s)]).OnlyEnforceIf(b)
                # not b -> y[q] + 1 <= y[p]
                model.Add(y[(q, s)] + 1 <= y[(p, s)]).OnlyEnforceIf(b.Not())

    # Inversions: inv = |above_s - above_{s-1}| penalizes order changes.
    inv_terms = []
    for (p, q) in interesting_pairs:
        for s in range(1, S):
            k1 = (p, q, s)
            k0 = (p, q, s - 1)
            if k1 in above and k0 in above:
                inv = model.NewBoolVar(f"inv_p{p}_q{q}_s{s}")
                # inv == abs(b1 - b0), linearized over booleans:
                b1, b0 = above[k1], above[k0]
                # inv >= b1-b0 ; inv >= b0-b1 ; inv <= b1+b0 ; inv <= 2-(b1+b0)
                model.Add(inv >= b1 - b0)
                model.Add(inv >= b0 - b1)
                model.Add(inv <= b1 + b0)
                model.Add(inv <= 2 - (b1 + b0))
                inv_terms.append(inv)

    # Closeness: when two speakers interact strongly, penalize their lane
    # distance in segments where both are active (direction-agnostic).
    close_terms = []
    close_weights = []
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        for s in range(S):
            if (pa, s) in y and (pb, s) in y:
                d = model.NewIntVar(0, max_lanes, f"dist_{pa}_{pb}_s{s}")
                model.AddAbsEquality(d, y[(pa, s)] - y[(pb, s)])
                close_terms.append(d)
                close_weights.append(float(w))

    # Objective: weighted sum (2*wiggle + 4*inversions + capped closeness).
    obj = []
    # wiggle weight
    obj += [2 * t for t in wiggle_terms]
    # inversions weight
    obj += [4 * t for t in inv_terms]
    # closeness weighted (scaled)
    for t, w in zip(close_terms, close_weights):
        # CP-SAT needs integer coefficients; cap the scaled weight at 20.
        obj.append(int(min(20.0, 1.0 + w)) * t)

    model.Minimize(sum(obj))

    solver = cp_model.CpSolver()
    solver.parameters.max_time_in_seconds = float(settings.ilp_time_limit_s)
    solver.parameters.num_search_workers = 8

    status = solver.Solve(model)

    if status not in (cp_model.OPTIMAL, cp_model.FEASIBLE):
        # No (timely) solution: NaN lanes signal the caller to fall back.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    # Extract solved lane per (speaker, segment).
    y_seg = { (speakers[p], s): float(solver.Value(y[(p, s)])) for (p, s) in y.keys() }

    # Map each message to its speaker's lane in the enclosing segment.
    out = metrics.copy()
    out["y"] = np.nan
    for seg_id, start, end in segments:
        mask = (out["message_id"] >= start) & (out["message_id"] < end)
        for sp in speakers:
            m2 = mask & (out["speaker"] == sp)
            if m2.any() and (sp, seg_id) in y_seg:
                out.loc[m2, "y"] = y_seg[(sp, seg_id)]

    out["y_smooth"] = out["y"]  # smoothing could be applied here if desired
    out["line_width"] = 1.0
    return out
conversation_storyline/openai_refiner.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ from typing import Dict, Any, List, Optional
6
+
7
+
8
def _client():
    # Lazy import so the module loads without the openai dependency installed;
    # a fresh client is built per call from the OPENAI_API_KEY env var.
    from openai import OpenAI
    return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
11
+
12
+
13
def pick_reply_to_openai(
    model_name: str,
    target: Dict[str, Any],
    candidates: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Select reply_to_id for *target*, restricted to the top-K *candidates*.
    Returns a dict with reply_to_id (int or None) and confidence (0..1).
    On malformed or schema-violating model output, falls back to
    {"reply_to_id": None, "confidence": 0.2}.
    """
    # System prompt is a runtime string (sent to the model) — kept as-is.
    sys = (
        "Eres un clasificador preciso de 'reply_to' en conversaciones.\n"
        "Debes elegir el message_id al que responde el target, SOLO entre los candidatos.\n"
        "Si ninguno encaja, devuelve reply_to_id=null.\n"
        "Devuelve SOLO JSON válido."
    )

    user = {
        "task": "Pick reply_to_id for target among candidates.",
        "target": target,
        "candidates": candidates,
        "output_schema": {"reply_to_id": "int|null", "confidence": "0..1"},
    }

    # temperature=0.0: deterministic classification behavior.
    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0.0,
    )

    txt = resp.choices[0].message.content.strip()
    try:
        data = json.loads(txt)
        if "reply_to_id" not in data:
            # Schema violation: treat as "no reply" with low confidence.
            return {"reply_to_id": None, "confidence": 0.2}
        return data
    except Exception:
        # Non-JSON output: same conservative fallback.
        return {"reply_to_id": None, "confidence": 0.2}
53
+
54
+
55
def label_topics_openai(
    model_name: str,
    segments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Label conversation segments with short topic names via the chat API.
    input: [{segment_id, sample_messages: [...]}]
    output: [{segment_id, topic_label}]
    On unparseable model output, returns generic "Tema {segment_id}" labels.
    """
    # System prompt is a runtime string (sent to the model) — kept as-is.
    sys = (
        "Eres un analista de conversaciones. Etiqueta cada segmento con un topic_label corto (3-6 palabras).\n"
        "Devuelve SOLO JSON válido: lista de objetos {segment_id:int, topic_label:str}."
    )

    # Slightly non-zero temperature for more natural label wording.
    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(segments, ensure_ascii=False)},
        ],
        temperature=0.2,
    )
    txt = resp.choices[0].message.content.strip()
    try:
        return json.loads(txt)
    except Exception:
        # fallback: generic per-segment labels instead of raising
        return [{"segment_id": s["segment_id"], "topic_label": f"Tema {s['segment_id']}"} for s in segments]
conversation_storyline/pipeline.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, Any, Tuple
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
+ from .config import settings, has_openai_key
10
+ from .io import parse_transcript
11
+ from .embeddings import Embedder
12
+ from .reply_to import assign_reply_to_offline, refine_reply_to_with_openai
13
+ from .topic_shifts import detect_topic_shifts_ensemble, build_segments, assign_topics_basic
14
+ from .layout_ilp import compute_storyline_layout_ilp
15
+ from .layout_heuristic import compute_layout_heuristic
16
+ from .render import render_storyline_png
17
+ from .plots import (
18
+ plot_reply_distance_hist,
19
+ plot_interaction_heatmap,
20
+ plot_bump_activity,
21
+ plot_sankey_speaker_to_topic,
22
+ plot_sentiment_placeholder,
23
+ )
24
+
25
+
26
+ def _build_metrics(interactions) -> pd.DataFrame:
27
+ rows = []
28
+ for it in interactions:
29
+ rows.append(
30
+ {
31
+ "message_id": it.message_id,
32
+ "speaker": it.speaker,
33
+ "text": it.text,
34
+ "reply_to_id": it.reply_to_id,
35
+ "topic_id": it.topic_id,
36
+ "topic_label": it.topic_label,
37
+ "sentiment": it.sentiment,
38
+ "confidence_reply": it.confidence_reply,
39
+ }
40
+ )
41
+ df = pd.DataFrame(rows)
42
+ # reply distance
43
+ df["reply_distance"] = df.apply(
44
+ lambda r: (r["message_id"] - r["reply_to_id"]) if pd.notna(r["reply_to_id"]) else np.nan,
45
+ axis=1,
46
+ )
47
+ return df
48
+
49
+
50
+ def _interaction_matrix(metrics: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[Tuple[str, str], float]]:
51
+ """
52
+ Matriz from->to por reply_to.
53
+ También devuelve edges globales (speaker_a,speaker_b)->weight para layout.
54
+ """
55
+ sp = sorted(metrics["speaker"].unique().tolist())
56
+ mat = pd.DataFrame(0, index=sp, columns=sp, dtype=int)
57
+ edges = {}
58
+
59
+ for _, r in metrics.iterrows():
60
+ if pd.isna(r["reply_to_id"]):
61
+ continue
62
+ rid = int(r["reply_to_id"])
63
+ if rid < 0 or rid >= len(metrics):
64
+ continue
65
+ src = r["speaker"]
66
+ dst = metrics.loc[rid, "speaker"]
67
+ mat.loc[src, dst] += 1
68
+ key = (src, dst)
69
+ edges[key] = edges.get(key, 0.0) + 1.0
70
+ # simetrizar un poco para “cercanía” (no dirección)
71
+ key2 = (dst, src)
72
+ edges[key2] = edges.get(key2, 0.0) + 0.6
73
+
74
+ return mat, edges
75
+
76
+
77
def run_pipeline_from_text(
    transcript_text: str,
    out_dir: Path,
    openai_model: str = "none (offline)",
) -> Dict[str, Any]:
    """
    Run the full storyline pipeline on a raw transcript.

    Steps: parse -> embed -> reply_to assignment (optional LLM refinement)
    -> topic-shift detection -> metrics -> layout (ILP or heuristic fallback)
    -> PNG render, plotly figures and file artifacts in ``out_dir``.

    Args:
        transcript_text: raw transcript, one "Speaker: text" line per message.
        out_dir: directory for generated artifacts (created if missing).
        openai_model: OpenAI model id, or "none (offline)" to skip refinement.

    Returns:
        Dict with artifact paths (PNG/CSV/JSONL/JSON), an embeddable HTML
        snippet, a text summary and the plotly figure objects.

    Raises:
        ValueError: when no "Speaker: text" lines can be parsed.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1) parse the transcript into Interaction objects
    interactions = parse_transcript(transcript_text)
    if not interactions:
        raise ValueError("No se detectaron líneas tipo 'Speaker: texto'.")

    # 2) embeddings — presumably L2-normalized rows (downstream code dot-products
    #    them as cosine similarity); TODO confirm against Embedder
    embedder = Embedder()
    E = embedder.encode([it.text for it in interactions])

    # 3) offline reply_to assignment + optional OpenAI refinement
    assign_reply_to_offline(interactions, E)
    if openai_model != "none (offline)" and has_openai_key():
        # Selective refinement (only ambiguous cases, not every message)
        refine_reply_to_with_openai(interactions, E, model_name=openai_model, max_refines=None)

    # 4) topic shifts (ensemble detector) -> contiguous segments -> labels
    shifts = detect_topic_shifts_ensemble(E)
    segments = build_segments(len(interactions), shifts)
    assign_topics_basic(interactions, segments)

    # 5) per-message metrics table
    metrics = _build_metrics(interactions)

    # 6) interaction matrix + edge weights for the layout
    inter_mat, edges = _interaction_matrix(metrics)

    # 7) layout: ILP when the problem is small enough, otherwise heuristic
    metrics["topic_id"] = metrics["topic_id"].fillna(0).astype(int)
    if (metrics["speaker"].nunique() <= settings.ilp_max_participants) and (len(segments) <= settings.ilp_max_segments):
        laid = compute_storyline_layout_ilp(metrics, segments, edges)
        # fall back to the heuristic when the ILP produced no usable lanes
        if laid["y"].isna().all():
            laid = compute_layout_heuristic(metrics)
    else:
        laid = compute_layout_heuristic(metrics)

    # 8) render the storyline PNG
    storyline_png = out_dir / "storyline.png"
    render_storyline_png(laid, str(storyline_png), title="Dinámica Narrativa (Storyline)")

    # 9) plotly figures
    fig_hist = plot_reply_distance_hist(metrics)
    fig_heat = plot_interaction_heatmap(inter_mat)
    fig_bump = plot_bump_activity(metrics)
    fig_sankey = plot_sankey_speaker_to_topic(metrics)
    fig_sent = plot_sentiment_placeholder(metrics)

    # 10) export file artifacts
    metrics_csv = out_dir / "metrics.csv"
    metrics.to_csv(metrics_csv, index=False, encoding="utf-8")

    interactions_jsonl = out_dir / "interactions.jsonl"
    with interactions_jsonl.open("w", encoding="utf-8") as f:
        for it in interactions:
            f.write(json.dumps(it.__dict__, ensure_ascii=False) + "\n")

    graph_json = out_dir / "graph.json"
    graph_json.write_text(
        json.dumps(
            {
                "nodes": [{"id": int(r["message_id"]), "speaker": r["speaker"]} for _, r in metrics.iterrows()],
                # NOTE(review): links pair message_id with reply_to_id — assumes
                # both use the same (0-based) numbering; confirm in the parser
                "links": [
                    {"source": int(r["message_id"]), "target": int(r["reply_to_id"])}
                    for _, r in metrics.iterrows()
                    if pd.notna(r["reply_to_id"])
                ],
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
    )

    # Simple embeddable storyline HTML: image + summary line
    storyline_html = f"""
    <div style="font-family: system-ui; line-height: 1.35">
      <h3>Storyline</h3>
      <p><b>Speakers:</b> {metrics["speaker"].nunique()} | <b>Mensajes:</b> {len(metrics)}</p>
      <img src="file/{storyline_png.name}" style="max-width: 100%; border-radius: 12px;" />
      <p style="opacity:0.8">Archivos generados en: {out_dir}</p>
    </div>
    """

    # Plain-text run summary
    summary = (
        f"Speakers: {metrics['speaker'].nunique()} | Mensajes: {len(metrics)}\n"
        f"Topic segments: {len(segments)} | Shifts detectados: {len(shifts)}\n"
        f"Reply_to NULL: {int(metrics['reply_to_id'].isna().sum())}\n"
        f"Media distancia reply_to: {metrics['reply_distance'].dropna().mean():.2f}\n"
    )

    return {
        "storyline_png": storyline_png,
        "storyline_html": storyline_html,
        "metrics_csv": metrics_csv,
        "interactions_jsonl": interactions_jsonl,
        "graph_json": graph_json,
        "summary_text": summary,
        "fig_sentiment": fig_sent,
        "fig_bump": fig_bump,
        "fig_heatmap": fig_heat,
        "fig_hist_reply_dist": fig_hist,
        "fig_sankey": fig_sankey,
    }
conversation_storyline/plots.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, List, Tuple
4
+ import numpy as np
5
+ import pandas as pd
6
+ import plotly.graph_objects as go
7
+ import networkx as nx
8
+
9
+
10
def plot_reply_distance_hist(metrics: pd.DataFrame) -> go.Figure:
    """Histogram of reply distances (message_id - reply_to_id), NaNs excluded."""
    distances = metrics["reply_distance"].dropna().astype(int)
    figure = go.Figure()
    figure.add_histogram(x=distances, nbinsx=40)
    figure.update_layout(
        title="Distribución distancia reply_to (message_id - reply_to_id)",
        xaxis_title="distancia",
        yaxis_title="conteo",
    )
    return figure
16
+
17
+
18
def plot_interaction_heatmap(inter_matrix: pd.DataFrame) -> go.Figure:
    """Heatmap of reply counts: rows are repliers (from), columns targets (to)."""
    heatmap = go.Heatmap(z=inter_matrix.values, x=inter_matrix.columns, y=inter_matrix.index)
    figure = go.Figure(data=heatmap)
    figure.update_layout(
        title="Heatmap interacciones (conteo respuestas)",
        xaxis_title="to",
        yaxis_title="from",
    )
    return figure
22
+
23
+
24
def plot_bump_activity(metrics: pd.DataFrame) -> go.Figure:
    """Bump chart: per-segment activity ranking of each speaker (rank 1 = most active)."""
    if "topic_id" not in metrics.columns:
        placeholder = go.Figure()
        placeholder.update_layout(title="Bump actividad (no topic_id)")
        return placeholder

    # message counts per (segment, speaker)
    activity = metrics.pivot_table(
        index="topic_id", columns="speaker", values="message_id", aggfunc="count", fill_value=0
    )
    # rank within each segment; highest activity gets rank 1
    rank_table = activity.rank(axis=1, method="average", ascending=False)

    figure = go.Figure()
    for speaker in activity.columns:
        figure.add_trace(
            go.Scatter(x=activity.index, y=rank_table[speaker], mode="lines+markers", name=speaker)
        )
    figure.update_layout(
        title="Bump chart: ranking actividad por segmento",
        xaxis_title="topic_id",
        yaxis_title="rank (1 = más activo)",
        yaxis_autorange="reversed",
    )
    return figure
45
+
46
+
47
def plot_sankey_speaker_to_topic(metrics: pd.DataFrame) -> go.Figure:
    """
    Sankey diagram of message volume flowing from each speaker to each topic.

    Bug fix: previously ``fillna("Tema")`` was applied only when collecting
    the node labels but not before the groupby; since groupby drops NaN keys,
    messages without a topic_label produced a dangling "Tema" node with no
    links. The fill is now applied once, so those messages are counted
    under "Tema". Behavior is unchanged when no topic_label is NaN.
    """
    if "topic_label" not in metrics.columns:
        fig = go.Figure()
        fig.update_layout(title="Sankey (no topic_label)")
        return fig

    # Fill once, use consistently for both node labels and link aggregation.
    filled = metrics.assign(topic_label=metrics["topic_label"].fillna("Tema"))

    speakers = filled["speaker"].unique().tolist()
    topics = filled["topic_label"].unique().tolist()

    # Node indices: speakers first, then topics.
    s_idx = {s: i for i, s in enumerate(speakers)}
    t_idx = {t: i + len(speakers) for i, t in enumerate(topics)}

    links = filled.groupby(["speaker", "topic_label"])["message_id"].count().reset_index()
    source = [s_idx[r["speaker"]] for _, r in links.iterrows()]
    target = [t_idx[r["topic_label"]] for _, r in links.iterrows()]
    value = links["message_id"].tolist()

    labels = speakers + topics

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(label=labels, pad=10, thickness=12),
                link=dict(source=source, target=target, value=value),
            )
        ]
    )
    fig.update_layout(title="Sankey: Speaker → Topic (volumen de mensajes)")
    return fig
76
+
77
+
78
def plot_sentiment_placeholder(metrics: pd.DataFrame) -> go.Figure:
    """Sentiment-over-messages line plot, or an empty figure when unavailable."""
    figure = go.Figure()
    has_sentiment = "sentiment" in metrics.columns and metrics["sentiment"].notna().any()
    if has_sentiment:
        figure.add_trace(
            go.Scatter(x=metrics["message_id"], y=metrics["sentiment"], mode="lines+markers")
        )
    figure.update_layout(title="Sentiment (si disponible)", xaxis_title="message_id", yaxis_title="sentiment")
    return figure
conversation_storyline/render.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+
8
+ from .config import settings
9
+
10
+
11
def render_storyline_png(metrics: pd.DataFrame, out_png: str, title: str = "Storyline"):
    """
    Simple storyline render:
      - x axis = message_id
      - y axis = per-speaker lane (the precomputed "y" column)
    Writes the figure to ``out_png`` and closes it.
    """
    if metrics.empty:
        placeholder = plt.figure(figsize=(12, 4))
        plt.title(title)
        plt.text(0.5, 0.5, "No data", ha="center", va="center")
        plt.axis("off")
        placeholder.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
        plt.close(placeholder)
        return

    figure = plt.figure(figsize=(14, 6))
    plt.title(title)

    for _speaker, group in metrics.groupby("speaker"):
        ordered = group.sort_values("message_id")
        xs = ordered["message_id"].to_numpy()
        ys = ordered["y"].to_numpy()

        # break the line where y is NaN; need >= 2 visible points to draw
        visible = ~np.isnan(ys)
        if visible.sum() < 2:
            continue

        plt.plot(xs[visible], ys[visible], linewidth=2.0, alpha=0.9)

    plt.yticks([])
    plt.xlabel("message_id")
    plt.tight_layout()
    figure.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
    plt.close(figure)
conversation_storyline/reply_to.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Optional, Tuple, Dict
4
+ import numpy as np
5
+
6
+ from .schemas import Interaction
7
+ from .config import settings
8
+ from .embeddings import cosine_sim_matrix
9
+
10
+
11
def topk_reply_candidates(
    i: int,
    interactions: List[Interaction],
    E: np.ndarray,
    window: int,
    top_k: int,
) -> List[Tuple[int, float]]:
    """
    Return (candidate_id, sim) pairs for message ``i``, looking backwards.

    Heuristic:
      - candidates come from the last ``window`` messages
      - ranked by cosine similarity of the embeddings (E assumed normalized)
      - the immediately previous message is ALWAYS included

    Bug fix: the old code appended the previous message and then truncated to
    ``top_k`` again, which could drop it right back out when its similarity
    was the lowest of the batch — defeating the guarantee. It is now retained
    by trimming the other candidates to ``top_k - 1`` first.
    """
    if i <= 0:
        return []

    start = max(0, i - window)
    cand_ids = list(range(start, i))

    sims = cosine_sim_matrix(E[cand_ids], E[i])
    order = np.argsort(-sims)

    top = [(cand_ids[idx], float(sims[idx])) for idx in order[:top_k]]

    # Guarantee the immediately previous message survives truncation.
    prev = i - 1
    if prev not in [c for c, _ in top]:
        prev_sim = float(cosine_sim_matrix(E[[prev]], E[i])[0])
        top = top[: max(top_k - 1, 0)]
        top.append((prev, prev_sim))
        top.sort(key=lambda x: x[1], reverse=True)

    return top[:top_k]
44
+
45
+
46
def assign_reply_to_offline(
    interactions: List[Interaction],
    E: np.ndarray,
) -> None:
    """
    Offline (LLM-free) reply assignment:
      - accept the top-1 candidate when its sim >= settings.reply_min_sim
      - otherwise leave reply_to as None
      - confidence mirrors the similarity
    """
    for idx, interaction in enumerate(interactions):
        # The opening message can never be a reply.
        if idx == 0:
            interaction.reply_to_id = None
            interaction.confidence_reply = 1.0
            continue

        candidates = topk_reply_candidates(
            i=idx,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )

        if not candidates:
            interaction.reply_to_id = None
            interaction.confidence_reply = 0.0
            continue

        best_id, best_sim = candidates[0]
        # Ambiguity (small top1-top2 gap) is left to the optional LLM pass;
        # offline we simply accept or reject the top-1 by threshold.
        interaction.confidence_reply = float(best_sim)
        interaction.reply_to_id = int(best_id) if best_sim >= settings.reply_min_sim else None
83
+
84
+
85
def needs_llm_refine(i: int, top: List[Tuple[int, float]]) -> bool:
    """
    Whether this message deserves an LLM refinement pass:
      - no candidates at all
      - best similarity below the acceptance threshold
      - top1 vs top2 too close to call (ambiguous)
    """
    if not top:
        return True
    best_sim = top[0][1]
    if best_sim < settings.reply_min_sim:
        return True
    return len(top) >= 2 and (best_sim - top[1][1]) < settings.reply_ambig_delta
98
+
99
+
100
def refine_reply_to_with_openai(
    interactions: List[Interaction],
    E: np.ndarray,
    model_name: str,
    max_refines: Optional[int] = None,
) -> None:
    """
    Selective reply_to refinement with OpenAI.

    Only messages flagged by needs_llm_refine() are sent to the model, with
    their top-K candidates; the model picks the final reply_to_id in place.

    Args:
        interactions: messages to refine (mutated in place).
        E: embedding matrix, one row per message.
        model_name: OpenAI model id, or "none (offline)" to skip entirely.
        max_refines: hard cap on LLM calls; None means unlimited.

    Bug fixes vs. the previous version:
      - ``if max_refines and ...`` treated ``max_refines=0`` as unlimited
        (falsy zero); the cap is now checked with ``is not None`` and BEFORE
        each call, so 0 performs no refinements at all.
      - the model's reply_to_id is coerced to int (it may come back as a
        string from the JSON response).
    """
    from .openai_refiner import pick_reply_to_openai

    if model_name == "none (offline)":
        return

    refined = 0
    for i in range(1, len(interactions)):
        if max_refines is not None and refined >= max_refines:
            break

        top = topk_reply_candidates(
            i=i,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )
        if not needs_llm_refine(i, top):
            continue

        # Compact candidate payload; texts truncated to keep prompts small.
        cand_pack = []
        for cid, sim in top:
            cand_pack.append(
                {
                    "message_id": int(cid),
                    "speaker": interactions[cid].speaker,
                    "text": interactions[cid].text[:600],
                    "sim": float(sim),
                }
            )

        picked = pick_reply_to_openai(
            model_name=model_name,
            target={
                "message_id": int(i),
                "speaker": interactions[i].speaker,
                "text": interactions[i].text,
            },
            candidates=cand_pack,
        )

        picked_id = picked.get("reply_to_id", None)
        interactions[i].reply_to_id = int(picked_id) if picked_id is not None else None
        interactions[i].confidence_reply = float(picked.get("confidence", interactions[i].confidence_reply or 0.0))

        refined += 1
conversation_storyline/schemas.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Dict, Any
3
+
4
+
5
@dataclass
class Interaction:
    """A single transcript message plus annotations inferred by the pipeline."""

    # parsed directly from the transcript
    message_id: int
    speaker: str
    text: str

    # inferred: filled in by reply_to assignment and topic-shift detection
    reply_to_id: Optional[int] = None
    topic_id: Optional[int] = None
    topic_label: Optional[str] = None

    # optional metrics: sentiment score and reply confidence (tracks similarity)
    sentiment: Optional[float] = None
    confidence_reply: Optional[float] = None
19
+
20
+
21
@dataclass
class PipelineArtifacts:
    """Bundle of file paths and figures produced by one pipeline run."""

    # output directory and per-artifact file paths
    out_dir: str
    storyline_png: str
    storyline_html: str
    metrics_csv: str
    interactions_jsonl: str
    graph_json: str

    # human-readable run summary and figure objects keyed by name
    summary_text: str
    figs: Dict[str, Any]
conversation_storyline/topic_shifts.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Tuple, Dict
4
+ import numpy as np
5
+ import ruptures as rpt
6
+
7
+ from .config import settings
8
+ from .schemas import Interaction
9
+
10
+
11
def detect_topic_shifts_ruptures(E: np.ndarray) -> List[int]:
    """
    Change-point detection over the (normalized) embedding signal.
    Returns indices of messages that START a new segment (0 excluded).
    """
    n = E.shape[0]
    # Too short to hold even two minimum-size segments.
    if n < settings.topic_min_size * 2:
        return []

    # ruptures consumes the raw (n, d) signal directly.
    detector = rpt.Pelt(model="rbf", min_size=settings.topic_min_size).fit(E)
    # Penalty scales with log(n).
    penalty = settings.topic_penalty_scale * np.log(max(n, 2))
    breakpoints = detector.predict(pen=penalty)  # endpoints, n included last

    # Convert endpoints into segment starts, enforcing the minimum size;
    # too-short candidates are skipped (merged into the following segment).
    starts: List[int] = []
    last_accepted = 0
    for endpoint in breakpoints:
        if endpoint >= n:
            break
        if endpoint - last_accepted >= settings.topic_min_size:
            starts.append(endpoint)
            last_accepted = endpoint

    return starts
37
+
38
+
39
def detect_topic_shifts_ensemble(E: np.ndarray) -> List[int]:
    """
    Simple ensemble of shift detectors:
      - ruptures change-point detection
      - sharp drops in adjacent-message similarity
    """
    candidates = set(detect_topic_shifts_ruptures(E))
    n = E.shape[0]
    if n >= 4:
        # Row-wise dot product of consecutive embeddings == cosine sim
        # (rows are normalized).
        adjacent = np.sum(E[1:] * E[:-1], axis=1)
        cutoff = float(np.percentile(adjacent, 10))  # worst 10% flagged
        for pos, sim in enumerate(adjacent, start=1):
            if sim <= cutoff:
                candidates.add(pos)
    return sorted(c for c in candidates if 0 < c < n)
54
+
55
+
56
def build_segments(n_messages: int, shift_starts: List[int]) -> List[Tuple[int, int, int]]:
    """
    Return segments as (segment_id, start, end_exclusive) tuples covering
    [0, n_messages), split at the given shift start indices. Out-of-range
    and duplicate shift indices are ignored.
    """
    boundaries = {s for s in shift_starts if 0 < s < n_messages}
    starts = [0] + sorted(boundaries)

    segments: List[Tuple[int, int, int]] = []
    for seg_id, begin in enumerate(starts):
        end = starts[seg_id + 1] if seg_id + 1 < len(starts) else n_messages
        segments.append((seg_id, begin, end))
    return segments
64
+
65
+
66
def assign_topics_basic(interactions: List[Interaction], segments: List[Tuple[int, int, int]]) -> None:
    """Stamp each interaction with its segment id and a generic per-segment label."""
    for seg_id, start, end in segments:
        label = f"Tema {seg_id}"
        for message in interactions[start:end]:
            message.topic_id = seg_id
            message.topic_label = label
requirements.txt CHANGED
@@ -1,11 +1,13 @@
1
- gradio
2
- matplotlib
3
- scipy
4
- numpy
5
- pydantic
6
- networkx
7
- openai
8
- pandas
9
- plotly
10
- scikit-learn
11
- ruptures
 
 
 
1
+ gradio==4.44.1
2
+ numpy==1.26.4
3
+ pandas==2.2.2
4
+ matplotlib==3.9.0
5
+ plotly==5.22.0
6
+ networkx==3.3
7
+ ruptures==1.1.9
8
+ ortools==9.10.4067
9
+ scikit-learn==1.5.1
10
+ sentence-transformers==3.0.1
11
+
12
+ # Optional (only needed if you want LLM-based refinement):
13
+ openai==1.40.8