Spaces:
Runtime error
Runtime error
Upload 23 files
Browse files — app.py +107 -296
- conversation_storyline/__init__.py +12 -0
- conversation_storyline/__pycache__/__init__.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/config.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/embeddings.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/io.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/pipeline.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/reply_to.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/schemas.cpython-313.pyc +0 -0
- conversation_storyline/__pycache__/topic_shifts.cpython-313.pyc +0 -0
- conversation_storyline/config.py +33 -0
- conversation_storyline/embeddings.py +53 -0
- conversation_storyline/io.py +49 -0
- conversation_storyline/layout_heuristic.py +29 -0
- conversation_storyline/layout_ilp.py +170 -0
- conversation_storyline/openai_refiner.py +82 -0
- conversation_storyline/pipeline.py +187 -0
- conversation_storyline/plots.py +83 -0
- conversation_storyline/render.py +45 -0
- conversation_storyline/reply_to.py +154 -0
- conversation_storyline/schemas.py +31 -0
- conversation_storyline/topic_shifts.py +70 -0
- requirements.txt +13 -11
app.py
CHANGED
|
@@ -1,296 +1,107 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
from
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
summary = f"Mensajes: {len(interactions)} | Participantes: {len(speakers)} | Modelo: {model_name}"
|
| 109 |
-
return interactions, metrics_df, layout_df, topic_shifts, summary
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
# ----------------------------
|
| 113 |
-
# Plot builders
|
| 114 |
-
# ----------------------------
|
| 115 |
-
def render_storyline_png(layout_df: pd.DataFrame, topic_shifts: List[int], out_path: str) -> str:
    """
    Minimal storyline render:
    - x axis: t_start
    - y axis: lane position

    Saves a PNG to *out_path* and returns that path. If *layout_df* is
    None or empty, a placeholder image is written instead.
    """
    if layout_df is None or layout_df.empty:
        # Create an empty placeholder image
        fig, ax = plt.subplots(figsize=(10, 3))
        ax.text(0.5, 0.5, "No layout data", ha="center", va="center")
        ax.axis("off")
        fig.savefig(out_path, dpi=160, bbox_inches="tight")
        plt.close(fig)
        return out_path

    df = layout_df.copy()
    if "y_smooth" not in df.columns:
        # Fall back to the raw lane when no smoothed column was provided.
        df["y_smooth"] = df["y"]

    fig, ax = plt.subplots(figsize=(14, 6))
    for sp, g in df.groupby("speaker", sort=False):
        g = g.sort_values("t_start")
        ax.plot(g["t_start"], g["y_smooth"], linewidth=1.5)
        # label at start of each speaker's line
        ax.text(g["t_start"].iloc[0], g["y_smooth"].iloc[0], str(sp), fontsize=9)

    # topic shifts as vertical lines (approx: by message id; if your x is t_start, adapt mapping)
    for x in topic_shifts:
        ax.axvline(x=x, linewidth=1.0, linestyle="--", alpha=0.6)

    ax.set_title("Storyline")
    ax.set_xlabel("t")
    ax.set_yticks([])
    fig.tight_layout()
    fig.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.close(fig)
    return out_path
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def plot_sentiment(interactions: List[dict]):
    """Plotly scatter of sentiment_score vs. message id, colored by speaker.

    Returns an empty Figure when there are no interactions.
    """
    df = pd.DataFrame(interactions)
    if df.empty:
        return go.Figure()
    return px.scatter(df, x="id", y="sentiment_score", color="speaker", title="Sentiment timeline")
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
def plot_reply_distance_hist(interactions: List[dict]):
    """Histogram of reply distances (id - reply_to_id); empty Figure when no data."""
    df = pd.DataFrame(interactions)
    if df.empty:
        return go.Figure()
    # Only messages that actually reply to something contribute a distance.
    d = df.dropna(subset=["reply_to_id"]).assign(dist=lambda x: x["id"] - x["reply_to_id"])
    return px.histogram(d, x="dist", nbins=50, title="Reply distance histogram")
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
def plot_bump(metrics_df: pd.DataFrame, top_n: int = 20):
    """Bump chart of per-segment centrality rank for the top_n speakers.

    Rank 1 (highest centrality) is drawn at the top via the reversed y axis.
    """
    if metrics_df is None or metrics_df.empty:
        return go.Figure()
    # Keep only the top_n speakers by mean centrality across all segments.
    avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
    df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
    df["rank"] = df.groupby("t_start")["centrality"].rank(ascending=False, method="dense")
    fig = px.line(df, x="t_start", y="rank", color="speaker", title=f"Bump chart (Top {top_n})")
    fig.update_yaxes(autorange="reversed")
    return fig
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
def plot_heatmap(metrics_df: pd.DataFrame, top_n: int = 20):
    """Heatmap of mean centrality per (speaker, segment start) for the top_n speakers."""
    if metrics_df is None or metrics_df.empty:
        return go.Figure()
    avg = metrics_df.groupby("speaker")["centrality"].mean().sort_values(ascending=False).head(top_n)
    df = metrics_df[metrics_df["speaker"].isin(avg.index)].copy()
    # Rows: speakers, columns: segment starts; missing cells become 0.
    piv = df.pivot_table(index="speaker", columns="t_start", values="centrality", aggfunc="mean").fillna(0)
    return px.imshow(piv, aspect="auto", title=f"Centrality heatmap (Top {top_n})")
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
def plot_topic_sankey(interactions: List[dict]):
    """Sankey diagram of consecutive topic transitions (self-transitions excluded)."""
    df = pd.DataFrame(interactions)
    if df.empty or "topic_label" not in df.columns:
        return go.Figure()

    topics = df["topic_label"].astype(str).tolist()
    # Count adjacent (from, to) topic pairs, skipping repeats of the same topic.
    links: Dict[Tuple[str, str], int] = {}
    for a, b in zip(topics[:-1], topics[1:]):
        if a == b:
            continue
        links[(a, b)] = links.get((a, b), 0) + 1

    if not links:
        return go.Figure()

    # Stable node ordering so indices are deterministic across runs.
    nodes = sorted(set([t for ab in links.keys() for t in ab]))
    idx = {n: i for i, n in enumerate(nodes)}

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(label=nodes),
                link=dict(
                    source=[idx[a] for (a, b) in links.keys()],
                    target=[idx[b] for (a, b) in links.keys()],
                    value=list(links.values()),
                ),
            )
        ]
    )
    fig.update_layout(title="Topic transitions (Sankey)")
    return fig
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
# ----------------------------
|
| 224 |
-
# Gradio callback (NO global state)
|
| 225 |
-
# ----------------------------
|
| 226 |
-
def process_transcript(transcript_text: str, model_name: str):
    """Gradio callback: run the analysis pipeline and build every output artifact.

    Returns (png path, summary text, 5 plotly figures) in the exact order wired
    into ``btn.click(outputs=...)``. Holds no global state.
    """
    # Run your pipeline (replace run_analysis internals with your real pipeline)
    interactions, metrics_df, layout_df, topic_shifts, summary = run_analysis(transcript_text, model_name)

    # This is where the previous version crashed:
    # `all_interactions` was not defined. Here we use `interactions`.
    active_participants = sorted({n["speaker"] for n in interactions}) if interactions else []

    # Build the storyline image in a fresh temp directory per request.
    tmpdir = tempfile.mkdtemp(prefix="storyline_")
    img_path = os.path.join(tmpdir, "storyline.png")
    render_storyline_png(layout_df, topic_shifts, img_path)

    # Secondary plots
    sentiment_fig = plot_sentiment(interactions)
    bump_fig = plot_bump(metrics_df, top_n=25)
    heatmap_fig = plot_heatmap(metrics_df, top_n=25)
    hist_fig = plot_reply_distance_hist(interactions)
    sankey_fig = plot_topic_sankey(interactions)

    # Summary text (participant list truncated to the first 50 names)
    summary_full = summary + f"\nParticipantes activos: {', '.join(active_participants[:50])}" + (
        " ..." if len(active_participants) > 50 else ""
    )

    return img_path, summary_full, sentiment_fig, bump_fig, heatmap_fig, hist_fig, sankey_fig
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
# ------------------------------------------------------------------
|
| 255 |
-
# Theme + UI (ONLY ONCE)
|
| 256 |
-
# ------------------------------------------------------------------
|
| 257 |
-
# ------------------------------------------------------------------
# Theme + UI (built ONCE at module import)
# ------------------------------------------------------------------
theme = gr.themes.Soft(primary_hue="blue").set(
    body_background_fill="*neutral_50",
    block_background_fill="*neutral_100",
)

with gr.Blocks(
    title="Conversation Storyline Visualizer – Advanced",
    theme=theme,  # theme belongs here (more compatible than demo.launch(theme=...))
) as demo:
    gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones")
    gr.Markdown("Soporte para conversaciones largas con chunking + refinamiento reply_to y topic shifts ensemble.")

    with gr.Row():
        model_selector = gr.Dropdown(
            choices=["gpt-4o-2024-08-06", "gpt-4o-mini-2024-07-18"],
            value="gpt-4o-2024-08-06",
            label="Modelo OpenAI",
        )

    input_text = gr.Textbox(label="Transcripción", lines=20)
    btn = gr.Button("Generar Visualizaciones", variant="primary")

    with gr.Tabs():
        with gr.Tab("Storyline Principal"):
            main_img = gr.Image(label="Storyline (PNG)")
            summary_box = gr.Textbox(label="Resumen", lines=6)
        with gr.Tab("Análisis Detallado"):
            sentiment_plot = gr.Plot(label="Sentimiento")
            bump_plot = gr.Plot(label="Ranking (Bump)")
            heatmap_plot = gr.Plot(label="Heatmap centralidad")
            hist_plot = gr.Plot(label="Hist reply-distance")
            sankey_plot = gr.Plot(label="Sankey topics")

    # Output order must match the tuple returned by process_transcript.
    btn.click(
        fn=process_transcript,
        inputs=[input_text, model_selector],
        outputs=[main_img, summary_box, sentiment_plot, bump_plot, heatmap_plot, hist_plot, sankey_plot],
    )

demo.launch()
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import tempfile
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import gradio as gr
|
| 5 |
+
|
| 6 |
+
from conversation_storyline.pipeline import run_pipeline_from_text
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Shared Gradio theme for the whole app.
THEME = gr.themes.Soft(primary_hue="blue").set(
    body_background_fill="*neutral_50",
    block_background_fill="*neutral_100",
)

# Browser-tab / Blocks title.
TITLE = "Conversation Storyline Visualizer – v4"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def process_transcript(transcript: str, model_selector: str):
    """
    Input: pasted transcript text.
    Output: storyline.png + summary + plotly figures + downloadable artifacts,
    returned in the exact order expected by the Gradio ``outputs=`` wiring.
    """
    if not transcript or not transcript.strip():
        raise gr.Error("Pega una transcripción en el cuadro de texto.")

    # Fresh working directory per request: no global state shared between runs.
    outdir = Path(tempfile.mkdtemp(prefix="storyline_v4_"))

    outputs = run_pipeline_from_text(
        transcript_text=transcript,
        out_dir=outdir,
        openai_model=model_selector,
    )

    # Order must match the `outputs=[...]` list in btn.click below.
    return (
        str(outputs["storyline_png"]),
        outputs["summary_text"],
        outputs["fig_sentiment"],
        outputs["fig_bump"],
        outputs["fig_heatmap"],
        outputs["fig_hist_reply_dist"],
        outputs["fig_sankey"],
        outputs["storyline_html"],
        outputs["metrics_csv"],
        outputs["interactions_jsonl"],
        outputs["graph_json"],
    )
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
with gr.Blocks(title=TITLE, theme=THEME) as demo:
    gr.Markdown("# Visualización Narrativa Avanzada de Conversaciones (v4)")
    gr.Markdown(
        "- Pega una transcripción tipo `Speaker A: ...`\n"
        "- Soporta conversaciones largas (chunking)\n"
        "- Reply-to Top-K embeddings + topic shifts ruptures + layout OR-Tools\n"
    )

    with gr.Row():
        # "none (offline)" runs the pipeline without any OpenAI call.
        model_selector = gr.Dropdown(
            choices=[
                "gpt-4o-2024-08-06",
                "gpt-4o-mini-2024-07-18",
                "none (offline)",
            ],
            value="none (offline)",
            label="Modelo (opcional; si hay OPENAI_API_KEY)",
        )

    input_text = gr.Textbox(label="Transcripción", lines=20, placeholder="Pega aquí la transcripción...")
    btn = gr.Button("Generar Visualizaciones", variant="primary")

    with gr.Tabs():
        with gr.Tab("Storyline Principal"):
            main_img = gr.Image(label="Storyline (PNG)")
            summary_box = gr.Textbox(label="Resumen", lines=10)
            storyline_html = gr.HTML(label="Storyline (HTML embebido)")
        with gr.Tab("Análisis Detallado"):
            sentiment_plot = gr.Plot(label="Sentiment (si aplica)")
            bump_plot = gr.Plot(label="Bump actividad por segmento")
            heatmap_plot = gr.Plot(label="Heatmap interacciones")
            hist_plot = gr.Plot(label="Histograma distancia reply_to")
            sankey_plot = gr.Plot(label="Sankey Speaker → Topic")

        with gr.Tab("Descargas"):
            metrics_csv = gr.File(label="metrics.csv")
            interactions_jsonl = gr.File(label="interactions.jsonl")
            graph_json = gr.File(label="graph.json")

    # Output order must match the tuple returned by process_transcript.
    btn.click(
        fn=process_transcript,
        inputs=[input_text, model_selector],
        outputs=[
            main_img,
            summary_box,
            sentiment_plot,
            bump_plot,
            heatmap_plot,
            hist_plot,
            sankey_plot,
            storyline_html,
            metrics_csv,
            interactions_jsonl,
            graph_json,
        ],
    )

if __name__ == "__main__":
    # HF Spaces: bind all interfaces; the port comes from $PORT (default 7860).
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
conversation_storyline/__init__.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public submodules of the conversation_storyline package.
__all__ = [
    "pipeline",
    "io",
    "schemas",
    "embeddings",
    "reply_to",
    "topic_shifts",
    "layout_ilp",
    "layout_heuristic",
    "render",
    "plots",
]
|
conversation_storyline/__pycache__/__init__.cpython-313.pyc
ADDED
|
Binary file (314 Bytes). View file
|
|
|
conversation_storyline/__pycache__/config.cpython-313.pyc
ADDED
|
Binary file (1.49 kB). View file
|
|
|
conversation_storyline/__pycache__/embeddings.cpython-313.pyc
ADDED
|
Binary file (3.08 kB). View file
|
|
|
conversation_storyline/__pycache__/io.cpython-313.pyc
ADDED
|
Binary file (2.04 kB). View file
|
|
|
conversation_storyline/__pycache__/pipeline.cpython-313.pyc
ADDED
|
Binary file (8.41 kB). View file
|
|
|
conversation_storyline/__pycache__/reply_to.cpython-313.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|
conversation_storyline/__pycache__/schemas.cpython-313.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
conversation_storyline/__pycache__/topic_shifts.cpython-313.pyc
ADDED
|
Binary file (3.76 kB). View file
|
|
|
conversation_storyline/config.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass(frozen=True)
class Settings:
    """Immutable tuning knobs shared across the pipeline."""
    # Parsing / ingest
    max_speaker_label_len: int = 64

    # Reply-to
    reply_window: int = 60           # how many previous messages to consider as candidates
    reply_top_k: int = 10            # top-k candidates by embedding similarity
    reply_min_sim: float = 0.25      # if top1 < threshold -> may stay None (offline)
    reply_ambig_delta: float = 0.03  # if top1-top2 < delta -> candidate for LLM refinement (if available)

    # Topic shifts (ruptures)
    topic_min_size: int = 8
    topic_penalty_scale: float = 2.4  # higher -> fewer change points

    # Layout (OR-Tools)
    ilp_time_limit_s: float = 6.0
    ilp_max_participants: int = 28  # beyond this, fall back to the heuristic layout
    ilp_max_segments: int = 120     # beyond this, fall back to the heuristic layout

    # Rendering
    storyline_dpi: int = 180


# Shared singleton used by the other modules.
settings = Settings()
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def has_openai_key() -> bool:
    """Report whether a non-blank OPENAI_API_KEY is present in the environment."""
    key = os.getenv("OPENAI_API_KEY", "")
    return key.strip() != ""
|
conversation_storyline/embeddings.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass
class Embedder:
    """
    CPU embeddings via Sentence-Transformers.
    - Normalizes vectors so cosine similarity reduces to a dot product.
    - Falls back to a simple TF-IDF model when Sentence-Transformers is unavailable.
    """
    model_name: str = "intfloat/multilingual-e5-small"
    # Lazily initialized backends (plain class attributes, not dataclass fields).
    _st = None
    _tfidf = None
    _tfidf_vectorizer = None

    def _load_st(self):
        # Import lazily so the package imports cleanly without sentence-transformers installed.
        if self._st is None:
            from sentence_transformers import SentenceTransformer
            self._st = SentenceTransformer(self.model_name)

    def encode(self, texts: List[str]) -> np.ndarray:
        """Return one L2-normalized float32 row vector per input text (None -> "")."""
        texts = [t or "" for t in texts]
        try:
            self._load_st()
            # e5 models expect "passage: " / "query: " prefixes; "passage:" is enough here.
            inp = [("passage: " + t) for t in texts]
            X = np.array(self._st.encode(inp, normalize_embeddings=True, show_progress_bar=False), dtype=np.float32)
            return X
        except Exception:
            # TF-IDF fallback (not semantically perfect, but always works offline).
            from sklearn.feature_extraction.text import TfidfVectorizer
            if self._tfidf_vectorizer is None:
                # NOTE(review): the vocabulary is fitted on the FIRST batch only; later
                # batches are transformed against that vocabulary — confirm this is intended.
                self._tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
                X = self._tfidf_vectorizer.fit_transform(texts).astype(np.float32)
            else:
                X = self._tfidf_vectorizer.transform(texts).astype(np.float32)
            # Densify and L2-normalize rows; epsilon guards all-zero rows.
            X = X.toarray()
            norms = np.linalg.norm(X, axis=1, keepdims=True) + 1e-9
            return X / norms
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def cosine_sim_matrix(A: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Cosine similarity of each row of *A* against *b*.

    A: (n, d) L2-normalized rows
    b: (d,) L2-normalized vector
    returns: (n,) float32 scores
    """
    scores = np.matmul(A, b)
    return scores.astype(np.float32)
|
conversation_storyline/io.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List
|
| 3 |
+
from .schemas import Interaction
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Tried in order; the first matching pattern wins.
SPEAKER_PATTERNS = [
    # "Speaker A: ..." (explicit "Speaker" prefix, label up to 64 chars)
    re.compile(r"^(?P<speaker>Speaker\s+[A-Za-z0-9_\- ]{1,64})\s*:\s*(?P<text>.+)\s*$"),
    # "A: ..." (bare name up to 32 chars; Spanish accented letters allowed)
    re.compile(r"^(?P<speaker>[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9_\- ]{1,32})\s*:\s*(?P<text>.+)\s*$"),
]
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def parse_transcript(text: str) -> List[Interaction]:
    """
    Robust parser for pasted text:
    - every line matching "SPEAKER: ..." starts a new message;
    - lines without a speaker prefix are appended to the previous message
      (continuation), or open an "Unknown" message if none exists yet.
    """
    interactions: List[Interaction] = []
    current = None

    for raw_line in (text or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines carry no content.
            continue

        match = None
        for pattern in SPEAKER_PATTERNS:
            candidate = pattern.match(stripped)
            if candidate is not None:
                match = candidate
                break

        if match is not None:
            # New message: id is its position in the output list.
            current = Interaction(
                message_id=len(interactions),
                speaker=match.group("speaker").strip(),
                text=match.group("text").strip(),
            )
            interactions.append(current)
        elif current is None:
            # Content before any speaker line goes to an "Unknown" message.
            current = Interaction(message_id=0, speaker="Unknown", text=stripped)
            interactions.append(current)
        else:
            # Continuation line: append to the current message.
            current.text = (current.text + " " + stripped).strip()

    return interactions
|
conversation_storyline/layout_heuristic.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def compute_layout_heuristic(metrics: pd.DataFrame) -> pd.DataFrame:
    """
    Fast fallback layout:
    - ranks speakers by total message count (descending)
    - assigns each speaker a fixed integer lane (y), unchanged across time
    Returns *metrics* untouched when it is empty.
    """
    if metrics.empty:
        return metrics

    ordered_speakers = (
        metrics.groupby("speaker")["message_id"]
        .count()
        .sort_values(ascending=False)
        .index
        .tolist()
    )
    lane_of = {speaker: lane for lane, speaker in enumerate(ordered_speakers)}

    result = metrics.copy()
    result["y"] = result["speaker"].map(lane_of).astype(float)
    result["y_smooth"] = result["y"]  # the heuristic applies no smoothing
    result["line_width"] = 1.0
    return result
|
conversation_storyline/layout_ilp.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple, Set
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
|
| 7 |
+
from ortools.sat.python import cp_model
|
| 8 |
+
|
| 9 |
+
from .config import settings
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def compute_storyline_layout_ilp(
    metrics: pd.DataFrame,
    segments: List[Tuple[int, int, int]],
    interactions_edges: Dict[Tuple[str, str], float],
) -> pd.DataFrame:
    """
    Per-segment layout with CP-SAT, minimizing:
    - order inversions between consecutive segments (proxy for line crossings)
    - wiggle (|y_t - y_{t-1}| per speaker)
    - lane distance between speakers that interact within the same segment

    metrics: one row per message with speaker, message_id, topic_id, etc.
    segments: (seg_id, start, end) with end exclusive on message_id
    interactions_edges: global (speaker_a, speaker_b) -> weight

    Returns *metrics* with y/y_smooth/line_width columns; all-NaN y on
    empty input, oversized problems, or an infeasible/timed-out solve.
    """
    if metrics.empty:
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    speakers = sorted(metrics["speaker"].unique().tolist())
    P = len(speakers)
    S = len(segments)

    if P > settings.ilp_max_participants or S > settings.ilp_max_segments:
        # Too large: the pipeline decides the fallback; guarded here for safety.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    sp_idx = {s: i for i, s in enumerate(speakers)}
    # NOTE(review): lane count is derived from speakers-per-topic_id, but the
    # AllDifferent constraint below is per *segment* — confirm segments never
    # contain more distinct speakers than any topic, or the model is infeasible.
    max_lanes = max(
        metrics.groupby("topic_id")["speaker"].nunique().max(),
        1
    )

    # active[p][s]: does speaker p appear in segment s?
    active = [[False] * S for _ in range(P)]
    for seg_id, start, end in segments:
        seg_speakers = set(metrics[(metrics["message_id"] >= start) & (metrics["message_id"] < end)]["speaker"])
        for sp in seg_speakers:
            active[sp_idx[sp]][seg_id] = True

    model = cp_model.CpModel()

    # y[(p, s)]: lane of speaker p in segment s; created only for active cells.
    y = {}
    for p in range(P):
        for s in range(S):
            if active[p][s]:
                y[(p, s)] = model.NewIntVar(0, max_lanes - 1, f"y_p{p}_s{s}")

    # One lane per speaker within a segment (active speakers only).
    for s in range(S):
        vars_s = [y[(p, s)] for p in range(P) if (p, s) in y]
        if len(vars_s) >= 2:
            model.AddAllDifferent(vars_s)

    # Wiggle terms: |y(p,s) - y(p,s-1)| for consecutive active segments.
    wiggle_terms = []
    for p in range(P):
        for s in range(1, S):
            if (p, s) in y and (p, s - 1) in y:
                d = model.NewIntVar(0, max_lanes, f"wiggle_p{p}_s{s}")
                model.AddAbsEquality(d, y[(p, s)] - y[(p, s - 1)])
                wiggle_terms.append(d)

    # Pair ordering vars above[p,q,s], restricted to pairs that actually
    # interact (weight > 0) to keep the model small.
    interesting_pairs: Set[Tuple[int, int]] = set()
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        if pa < pb:
            interesting_pairs.add((pa, pb))
        else:
            interesting_pairs.add((pb, pa))

    above = {}
    for (p, q) in interesting_pairs:
        for s in range(S):
            if (p, s) in y and (q, s) in y:
                b = model.NewBoolVar(f"above_p{p}_q{q}_s{s}")  # b true <=> p above q
                above[(p, q, s)] = b
                # b -> y[p] + 1 <= y[q]
                model.Add(y[(p, s)] + 1 <= y[(q, s)]).OnlyEnforceIf(b)
                # not b -> y[q] + 1 <= y[p]
                model.Add(y[(q, s)] + 1 <= y[(p, s)]).OnlyEnforceIf(b.Not())

    # Inversions: inv = |above_s - above_{s-1}| penalizes order changes.
    inv_terms = []
    for (p, q) in interesting_pairs:
        for s in range(1, S):
            k1 = (p, q, s)
            k0 = (p, q, s - 1)
            if k1 in above and k0 in above:
                inv = model.NewBoolVar(f"inv_p{p}_q{q}_s{s}")
                b1, b0 = above[k1], above[k0]
                # Linearization of inv == |b1 - b0| (XOR over booleans):
                # inv >= b1-b0 ; inv >= b0-b1 ; inv <= b1+b0 ; inv <= 2-(b1+b0)
                model.Add(inv >= b1 - b0)
                model.Add(inv >= b0 - b1)
                model.Add(inv <= b1 + b0)
                model.Add(inv <= 2 - (b1 + b0))
                inv_terms.append(inv)

    # Closeness: strongly-interacting speakers are pulled toward adjacent lanes
    # in every segment where both are active.
    close_terms = []
    close_weights = []
    for (a, b), w in interactions_edges.items():
        if w <= 0:
            continue
        pa, pb = sp_idx.get(a), sp_idx.get(b)
        if pa is None or pb is None or pa == pb:
            continue
        for s in range(S):
            if (pa, s) in y and (pb, s) in y:
                d = model.NewIntVar(0, max_lanes, f"dist_{pa}_{pb}_s{s}")
                model.AddAbsEquality(d, y[(pa, s)] - y[(pb, s)])
                close_terms.append(d)
                close_weights.append(float(w))

    # Objective: weighted sum of wiggle, inversions, and closeness terms.
    obj = []
    obj += [2 * t for t in wiggle_terms]
    obj += [4 * t for t in inv_terms]
    for t, w in zip(close_terms, close_weights):
        # CP-SAT requires integer coefficients; scale and cap the weight.
        obj.append(int(min(20.0, 1.0 + w)) * t)

    model.Minimize(sum(obj))

    solver = cp_model.CpSolver()
    solver.parameters.max_time_in_seconds = float(settings.ilp_time_limit_s)
    solver.parameters.num_search_workers = 8

    status = solver.Solve(model)

    if status not in (cp_model.OPTIMAL, cp_model.FEASIBLE):
        # No usable solution within the time limit -> signal fallback via NaNs.
        return metrics.assign(y=np.nan, y_smooth=np.nan, line_width=1.0)

    # Extract solved lane per (speaker, segment).
    y_seg = { (speakers[p], s): float(solver.Value(y[(p, s)])) for (p, s) in y.keys() }

    # Map each message to its speaker's lane in the segment containing it.
    out = metrics.copy()
    out["y"] = np.nan
    for seg_id, start, end in segments:
        mask = (out["message_id"] >= start) & (out["message_id"] < end)
        for sp in speakers:
            m2 = mask & (out["speaker"] == sp)
            if m2.any() and (sp, seg_id) in y_seg:
                out.loc[m2, "y"] = y_seg[(sp, seg_id)]

    out["y_smooth"] = out["y"]  # smoothing could be applied here if desired
    out["line_width"] = 1.0
    return out
|
conversation_storyline/openai_refiner.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
from typing import Dict, Any, List, Optional
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _client():
    """Build an OpenAI client from the OPENAI_API_KEY environment variable."""
    from openai import OpenAI

    api_key = os.getenv("OPENAI_API_KEY")
    return OpenAI(api_key=api_key)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def pick_reply_to_openai(
    model_name: str,
    target: Dict[str, Any],
    candidates: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """
    Ask an LLM to pick the reply_to_id for *target* among top-K *candidates*.

    Returns a dict {"reply_to_id": int|None, "confidence": 0..1}. On any
    parsing problem (invalid JSON, empty content, missing key) it falls back
    to {"reply_to_id": None, "confidence": 0.2}.

    Fix over the previous version: models frequently wrap JSON in markdown
    ``` fences despite instructions, and `content` can be None — both used
    to make json.loads fail and silently degrade every refinement.
    """
    sys = (
        "Eres un clasificador preciso de 'reply_to' en conversaciones.\n"
        "Debes elegir el message_id al que responde el target, SOLO entre los candidatos.\n"
        "Si ninguno encaja, devuelve reply_to_id=null.\n"
        "Devuelve SOLO JSON válido."
    )

    user = {
        "task": "Pick reply_to_id for target among candidates.",
        "target": target,
        "candidates": candidates,
        "output_schema": {"reply_to_id": "int|null", "confidence": "0..1"},
    }

    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(user, ensure_ascii=False)},
        ],
        temperature=0.0,
    )

    fallback = {"reply_to_id": None, "confidence": 0.2}
    txt = (resp.choices[0].message.content or "").strip()
    # Unwrap markdown code fences (``` or ```json) if present.
    if txt.startswith("```"):
        txt = txt.strip("`").strip()
        if txt.lower().startswith("json"):
            txt = txt[4:].lstrip()
    try:
        data = json.loads(txt)
    except Exception:
        return fallback
    if not isinstance(data, dict) or "reply_to_id" not in data:
        return fallback
    # Some models return the id as a numeric string; coerce to int.
    rid = data.get("reply_to_id")
    if isinstance(rid, str) and rid.isdigit():
        data["reply_to_id"] = int(rid)
    return data
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def label_topics_openai(
    model_name: str,
    segments: List[Dict[str, Any]],
) -> List[Dict[str, Any]]:
    """
    Label topic segments with a short name via an LLM.

    input:  [{segment_id, sample_messages:[...]}]
    output: [{segment_id, topic_label}]

    On any parsing failure (or a non-list answer) every segment falls back
    to the generic label "Tema {segment_id}".

    Fix over the previous version: strips markdown ``` fences that models
    often wrap JSON in, guards against None content, and validates that the
    parsed value is actually a list before returning it.
    """
    sys = (
        "Eres un analista de conversaciones. Etiqueta cada segmento con un topic_label corto (3-6 palabras).\n"
        "Devuelve SOLO JSON válido: lista de objetos {segment_id:int, topic_label:str}."
    )

    resp = _client().chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": sys},
            {"role": "user", "content": json.dumps(segments, ensure_ascii=False)},
        ],
        temperature=0.2,
    )
    txt = (resp.choices[0].message.content or "").strip()
    # Unwrap markdown code fences (``` or ```json) if present.
    if txt.startswith("```"):
        txt = txt.strip("`").strip()
        if txt.lower().startswith("json"):
            txt = txt[4:].lstrip()
    try:
        data = json.loads(txt)
        if isinstance(data, list):
            return data
    except Exception:
        pass
    # fallback: generic per-segment labels
    return [{"segment_id": s["segment_id"], "topic_label": f"Tema {s['segment_id']}"} for s in segments]
|
conversation_storyline/pipeline.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, Any, Tuple
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
from .config import settings, has_openai_key
|
| 10 |
+
from .io import parse_transcript
|
| 11 |
+
from .embeddings import Embedder
|
| 12 |
+
from .reply_to import assign_reply_to_offline, refine_reply_to_with_openai
|
| 13 |
+
from .topic_shifts import detect_topic_shifts_ensemble, build_segments, assign_topics_basic
|
| 14 |
+
from .layout_ilp import compute_storyline_layout_ilp
|
| 15 |
+
from .layout_heuristic import compute_layout_heuristic
|
| 16 |
+
from .render import render_storyline_png
|
| 17 |
+
from .plots import (
|
| 18 |
+
plot_reply_distance_hist,
|
| 19 |
+
plot_interaction_heatmap,
|
| 20 |
+
plot_bump_activity,
|
| 21 |
+
plot_sankey_speaker_to_topic,
|
| 22 |
+
plot_sentiment_placeholder,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _build_metrics(interactions) -> pd.DataFrame:
|
| 27 |
+
rows = []
|
| 28 |
+
for it in interactions:
|
| 29 |
+
rows.append(
|
| 30 |
+
{
|
| 31 |
+
"message_id": it.message_id,
|
| 32 |
+
"speaker": it.speaker,
|
| 33 |
+
"text": it.text,
|
| 34 |
+
"reply_to_id": it.reply_to_id,
|
| 35 |
+
"topic_id": it.topic_id,
|
| 36 |
+
"topic_label": it.topic_label,
|
| 37 |
+
"sentiment": it.sentiment,
|
| 38 |
+
"confidence_reply": it.confidence_reply,
|
| 39 |
+
}
|
| 40 |
+
)
|
| 41 |
+
df = pd.DataFrame(rows)
|
| 42 |
+
# reply distance
|
| 43 |
+
df["reply_distance"] = df.apply(
|
| 44 |
+
lambda r: (r["message_id"] - r["reply_to_id"]) if pd.notna(r["reply_to_id"]) else np.nan,
|
| 45 |
+
axis=1,
|
| 46 |
+
)
|
| 47 |
+
return df
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _interaction_matrix(metrics: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[Tuple[str, str], float]]:
|
| 51 |
+
"""
|
| 52 |
+
Matriz from->to por reply_to.
|
| 53 |
+
También devuelve edges globales (speaker_a,speaker_b)->weight para layout.
|
| 54 |
+
"""
|
| 55 |
+
sp = sorted(metrics["speaker"].unique().tolist())
|
| 56 |
+
mat = pd.DataFrame(0, index=sp, columns=sp, dtype=int)
|
| 57 |
+
edges = {}
|
| 58 |
+
|
| 59 |
+
for _, r in metrics.iterrows():
|
| 60 |
+
if pd.isna(r["reply_to_id"]):
|
| 61 |
+
continue
|
| 62 |
+
rid = int(r["reply_to_id"])
|
| 63 |
+
if rid < 0 or rid >= len(metrics):
|
| 64 |
+
continue
|
| 65 |
+
src = r["speaker"]
|
| 66 |
+
dst = metrics.loc[rid, "speaker"]
|
| 67 |
+
mat.loc[src, dst] += 1
|
| 68 |
+
key = (src, dst)
|
| 69 |
+
edges[key] = edges.get(key, 0.0) + 1.0
|
| 70 |
+
# simetrizar un poco para “cercanía” (no dirección)
|
| 71 |
+
key2 = (dst, src)
|
| 72 |
+
edges[key2] = edges.get(key2, 0.0) + 0.6
|
| 73 |
+
|
| 74 |
+
return mat, edges
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def run_pipeline_from_text(
    transcript_text: str,
    out_dir: Path,
    openai_model: str = "none (offline)",
) -> Dict[str, Any]:
    """
    End-to-end pipeline: raw transcript text -> storyline artifacts.

    Stages: parse -> embed -> reply_to (offline, optional LLM refine) ->
    topic shifts -> metrics -> interaction matrix -> layout (ILP with
    heuristic fallback) -> PNG render -> plotly figures -> file exports.

    Side effects: creates *out_dir* and writes storyline.png, metrics.csv,
    interactions.jsonl and graph.json into it.

    Returns a dict of artifact paths, the embeddable HTML snippet, a text
    summary and the plotly figures.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1) parse "Speaker: text" lines into Interaction objects
    interactions = parse_transcript(transcript_text)
    if not interactions:
        raise ValueError("No se detectaron líneas tipo 'Speaker: texto'.")

    # 2) one embedding row per message
    embedder = Embedder()
    E = embedder.encode([it.text for it in interactions])

    # 3) reply_to offline + optional LLM refinement
    assign_reply_to_offline(interactions, E)
    if openai_model != "none (offline)" and has_openai_key():
        # Selective refinement (does not refine every message)
        refine_reply_to_with_openai(interactions, E, model_name=openai_model, max_refines=None)

    # 4) topic shifts (ensemble) -> contiguous segments -> basic labels
    shifts = detect_topic_shifts_ensemble(E)
    segments = build_segments(len(interactions), shifts)
    assign_topics_basic(interactions, segments)

    # 5) per-message metrics table
    metrics = _build_metrics(interactions)

    # 6) interaction matrix + weighted edges for layout
    inter_mat, edges = _interaction_matrix(metrics)

    # 7) layout: ILP when the problem is small enough, heuristic otherwise
    metrics["topic_id"] = metrics["topic_id"].fillna(0).astype(int)
    if (metrics["speaker"].nunique() <= settings.ilp_max_participants) and (len(segments) <= settings.ilp_max_segments):
        laid = compute_storyline_layout_ilp(metrics, segments, edges)
        # ILP may time out / be infeasible (all-NaN y): fall back.
        if laid["y"].isna().all():
            laid = compute_layout_heuristic(metrics)
    else:
        laid = compute_layout_heuristic(metrics)

    # 8) render storyline PNG
    storyline_png = out_dir / "storyline.png"
    render_storyline_png(laid, str(storyline_png), title="Dinámica Narrativa (Storyline)")

    # 9) plotly figures
    fig_hist = plot_reply_distance_hist(metrics)
    fig_heat = plot_interaction_heatmap(inter_mat)
    fig_bump = plot_bump_activity(metrics)
    fig_sankey = plot_sankey_speaker_to_topic(metrics)
    fig_sent = plot_sentiment_placeholder(metrics)

    # 10) export artifacts
    metrics_csv = out_dir / "metrics.csv"
    metrics.to_csv(metrics_csv, index=False, encoding="utf-8")

    interactions_jsonl = out_dir / "interactions.jsonl"
    with interactions_jsonl.open("w", encoding="utf-8") as f:
        for it in interactions:
            f.write(json.dumps(it.__dict__, ensure_ascii=False) + "\n")

    # message graph: one node per message, one link per resolved reply_to
    graph_json = out_dir / "graph.json"
    graph_json.write_text(
        json.dumps(
            {
                "nodes": [{"id": int(r["message_id"]), "speaker": r["speaker"]} for _, r in metrics.iterrows()],
                "links": [
                    {"source": int(r["message_id"]), "target": int(r["reply_to_id"])}
                    for _, r in metrics.iterrows()
                    if pd.notna(r["reply_to_id"])
                ],
            },
            ensure_ascii=False,
            indent=2,
        ),
        encoding="utf-8",
    )

    # simple embeddable storyline HTML: image + stats
    storyline_html = f"""
<div style="font-family: system-ui; line-height: 1.35">
  <h3>Storyline</h3>
  <p><b>Speakers:</b> {metrics["speaker"].nunique()} | <b>Mensajes:</b> {len(metrics)}</p>
  <img src="file/{storyline_png.name}" style="max-width: 100%; border-radius: 12px;" />
  <p style="opacity:0.8">Archivos generados en: {out_dir}</p>
</div>
"""

    # plain-text summary for the UI
    summary = (
        f"Speakers: {metrics['speaker'].nunique()} | Mensajes: {len(metrics)}\n"
        f"Topic segments: {len(segments)} | Shifts detectados: {len(shifts)}\n"
        f"Reply_to NULL: {int(metrics['reply_to_id'].isna().sum())}\n"
        f"Media distancia reply_to: {metrics['reply_distance'].dropna().mean():.2f}\n"
    )

    return {
        "storyline_png": storyline_png,
        "storyline_html": storyline_html,
        "metrics_csv": metrics_csv,
        "interactions_jsonl": interactions_jsonl,
        "graph_json": graph_json,
        "summary_text": summary,
        "fig_sentiment": fig_sent,
        "fig_bump": fig_bump,
        "fig_heatmap": fig_heat,
        "fig_hist_reply_dist": fig_hist,
        "fig_sankey": fig_sankey,
    }
|
conversation_storyline/plots.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Dict, List, Tuple
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import plotly.graph_objects as go
|
| 7 |
+
import networkx as nx
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def plot_reply_distance_hist(metrics: pd.DataFrame) -> go.Figure:
    """Histogram of reply distances (message_id - reply_to_id)."""
    distances = metrics["reply_distance"].dropna().astype(int)
    fig = go.Figure()
    fig.add_histogram(x=distances, nbinsx=40)
    fig.update_layout(
        title="Distribución distancia reply_to (message_id - reply_to_id)",
        xaxis_title="distancia",
        yaxis_title="conteo",
    )
    return fig
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def plot_interaction_heatmap(inter_matrix: pd.DataFrame) -> go.Figure:
    """Heatmap of reply counts between speakers (row replies to column)."""
    heat = go.Heatmap(
        z=inter_matrix.values,
        x=inter_matrix.columns,
        y=inter_matrix.index,
    )
    fig = go.Figure(data=heat)
    fig.update_layout(
        title="Heatmap interacciones (conteo respuestas)",
        xaxis_title="to",
        yaxis_title="from",
    )
    return fig
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def plot_bump_activity(metrics: pd.DataFrame) -> go.Figure:
    """
    Bump chart of each speaker's per-segment activity rank
    (rank 1 = most messages in that topic segment).
    """
    # activity per topic_id and speaker
    if "topic_id" not in metrics.columns:
        fig = go.Figure()
        fig.update_layout(title="Bump actividad (no topic_id)")
        return fig

    piv = metrics.pivot_table(index="topic_id", columns="speaker", values="message_id", aggfunc="count", fill_value=0)
    # rank per segment (highest activity = rank 1); ties get averaged ranks
    ranks = piv.rank(axis=1, method="average", ascending=False)

    fig = go.Figure()
    for sp in piv.columns:
        fig.add_trace(go.Scatter(x=piv.index, y=ranks[sp], mode="lines+markers", name=sp))
    fig.update_layout(
        title="Bump chart: ranking actividad por segmento",
        xaxis_title="topic_id",
        yaxis_title="rank (1 = más activo)",
        # reversed axis puts rank 1 on top
        yaxis_autorange="reversed",
    )
    return fig
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def plot_sankey_speaker_to_topic(metrics: pd.DataFrame) -> go.Figure:
    """Sankey diagram of message volume flowing from speakers to topic labels."""
    if "topic_label" not in metrics.columns:
        fig = go.Figure()
        fig.update_layout(title="Sankey (no topic_label)")
        return fig

    speakers = metrics["speaker"].unique().tolist()
    topics = metrics["topic_label"].fillna("Tema").unique().tolist()

    # Node index space: speakers first, then topics appended after them.
    s_idx = {s: i for i, s in enumerate(speakers)}
    t_idx = {t: i + len(speakers) for i, t in enumerate(topics)}

    # One link per (speaker, topic) pair, weighted by message count.
    links = metrics.groupby(["speaker", "topic_label"])["message_id"].count().reset_index()
    source = [s_idx[r["speaker"]] for _, r in links.iterrows()]
    target = [t_idx[r["topic_label"]] for _, r in links.iterrows()]
    value = links["message_id"].tolist()

    labels = speakers + topics

    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(label=labels, pad=10, thickness=12),
                link=dict(source=source, target=target, value=value),
            )
        ]
    )
    fig.update_layout(title="Sankey: Speaker → Topic (volumen de mensajes)")
    return fig
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def plot_sentiment_placeholder(metrics: pd.DataFrame) -> go.Figure:
    """Sentiment-over-time line chart, or an empty figure when no sentiment exists."""
    fig = go.Figure()
    has_sentiment = "sentiment" in metrics.columns and metrics["sentiment"].notna().any()
    if has_sentiment:
        trace = go.Scatter(x=metrics["message_id"], y=metrics["sentiment"], mode="lines+markers")
        fig.add_trace(trace)
    fig.update_layout(title="Sentiment (si disponible)", xaxis_title="message_id", yaxis_title="sentiment")
    return fig
|
conversation_storyline/render.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import matplotlib.pyplot as plt
|
| 7 |
+
|
| 8 |
+
from .config import settings
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def render_storyline_png(metrics: pd.DataFrame, out_png: str, title: str = "Storyline"):
    """
    Render a simple storyline image to *out_png*:
      - x = message_id
      - y = lane assigned per speaker (the "y" column of *metrics*)

    Speakers with fewer than two non-NaN y points are skipped, so a
    single-message speaker is not drawn at all.
    """
    if metrics.empty:
        # Degenerate case: still write a file so callers can embed it.
        fig = plt.figure(figsize=(12, 4))
        plt.title(title)
        plt.text(0.5, 0.5, "No data", ha="center", va="center")
        plt.axis("off")
        fig.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
        plt.close(fig)
        return

    fig = plt.figure(figsize=(14, 6))
    plt.title(title)

    for speaker, g in metrics.groupby("speaker"):
        g = g.sort_values("message_id")
        x = g["message_id"].to_numpy()
        y = g["y"].to_numpy()

        # break the line where y is NaN (no lane assigned)
        ok = ~np.isnan(y)
        if ok.sum() < 2:
            continue

        plt.plot(x[ok], y[ok], linewidth=2.0, alpha=0.9)

    # lanes are abstract — hide the y axis labels
    plt.yticks([])
    plt.xlabel("message_id")
    plt.tight_layout()
    fig.savefig(out_png, dpi=settings.storyline_dpi, bbox_inches="tight")
    plt.close(fig)
|
conversation_storyline/reply_to.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Optional, Tuple, Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from .schemas import Interaction
|
| 7 |
+
from .config import settings
|
| 8 |
+
from .embeddings import cosine_sim_matrix
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def topk_reply_candidates(
    i: int,
    interactions: List[Interaction],
    E: np.ndarray,
    window: int,
    top_k: int,
) -> List[Tuple[int, float]]:
    """
    Return (candidate_id, similarity) pairs for message *i*, looking backwards.

    Heuristic:
      - only the last `window` messages before i are considered
      - scored by cosine similarity (E is expected to be row-normalized)
      - the immediately preceding message is force-added to the pool

    NOTE(review): `interactions` is unused here. Also, the final top_k
    truncation can still drop the preceding message when its similarity is
    the lowest, so the "always included" intent is only best-effort —
    confirm whether callers rely on prev being present.
    """
    if i <= 0:
        return []

    start = max(0, i - window)
    cand_ids = list(range(start, i))

    sims = cosine_sim_matrix(E[cand_ids], E[i])
    order = np.argsort(-sims)

    top = [(cand_ids[idx], float(sims[idx])) for idx in order[:top_k]]

    # make sure the immediately preceding message is in the pool
    prev = i - 1
    if prev not in [c for c, _ in top]:
        prev_sim = float(cosine_sim_matrix(E[[prev]], E[i])[0])
        top.append((prev, prev_sim))
        top.sort(key=lambda x: x[1], reverse=True)

    return top[:top_k]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def assign_reply_to_offline(
    interactions: List[Interaction],
    E: np.ndarray,
) -> None:
    """
    Offline (no-LLM) reply_to assignment; mutates *interactions* in place.

    The first message never replies to anything (confidence 1.0). For every
    other message the best-scoring candidate becomes reply_to_id when its
    cosine similarity reaches settings.reply_min_sim; otherwise reply_to is
    left as None. confidence_reply mirrors the similarity either way.
    """
    for idx, inter in enumerate(interactions):
        if idx == 0:
            inter.reply_to_id = None
            inter.confidence_reply = 1.0
            continue

        candidates = topk_reply_candidates(
            i=idx,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )

        if not candidates:
            inter.reply_to_id = None
            inter.confidence_reply = 0.0
            continue

        best_id, best_sim = candidates[0]
        # ambiguous top1/top2 cases still take top1 offline; the optional
        # LLM pass may refine them later
        inter.confidence_reply = float(best_sim)
        if best_sim >= settings.reply_min_sim:
            inter.reply_to_id = int(best_id)
        else:
            inter.reply_to_id = None
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def needs_llm_refine(i: int, top: List[Tuple[int, float]]) -> bool:
    """
    Decide whether a candidate list deserves an LLM second opinion:
    no candidates at all, best similarity below the floor, or a
    top-1/top-2 gap smaller than the ambiguity delta.
    """
    if not top:
        return True
    best_sim = top[0][1]
    if best_sim < settings.reply_min_sim:
        return True
    return len(top) >= 2 and (best_sim - top[1][1]) < settings.reply_ambig_delta
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def refine_reply_to_with_openai(
    interactions: List[Interaction],
    E: np.ndarray,
    model_name: str,
    max_refines: Optional[int] = None,
) -> None:
    """
    Selective reply_to refinement with OpenAI; mutates *interactions*.

    - Only "ambiguous" messages (per needs_llm_refine) are sent to the LLM.
    - The offline top-K candidates are offered; the model picks reply_to_id.
    - Stops after *max_refines* LLM calls when a limit is given.
    """
    # Local import keeps the openai dependency optional for offline runs.
    from .openai_refiner import pick_reply_to_openai

    if model_name == "none (offline)":
        return

    refined = 0
    for i in range(1, len(interactions)):
        top = topk_reply_candidates(
            i=i,
            interactions=interactions,
            E=E,
            window=settings.reply_window,
            top_k=settings.reply_top_k,
        )
        if not needs_llm_refine(i, top):
            continue

        cand_pack = []
        for cid, sim in top:
            cand_pack.append(
                {
                    "message_id": int(cid),
                    "speaker": interactions[cid].speaker,
                    # truncate candidate texts to keep the prompt small
                    "text": interactions[cid].text[:600],
                    "sim": float(sim),
                }
            )

        picked = pick_reply_to_openai(
            model_name=model_name,
            target={
                "message_id": int(i),
                "speaker": interactions[i].speaker,
                "text": interactions[i].text,
            },
            candidates=cand_pack,
        )

        # the LLM answer overrides the offline guess (possibly back to None)
        interactions[i].reply_to_id = picked.get("reply_to_id", None)
        interactions[i].confidence_reply = float(picked.get("confidence", interactions[i].confidence_reply or 0.0))

        refined += 1
        if max_refines and refined >= max_refines:
            break
|
conversation_storyline/schemas.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Optional, Dict, Any
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class Interaction:
    """One conversation message plus the attributes inferred by the pipeline."""

    message_id: int  # 0-based position in the transcript
    speaker: str
    text: str

    # inferred by the reply_to / topic-shift stages (None until assigned)
    reply_to_id: Optional[int] = None
    topic_id: Optional[int] = None
    topic_label: Optional[str] = None

    # optional metrics
    sentiment: Optional[float] = None  # may stay None; not set by the offline pipeline shown here
    confidence_reply: Optional[float] = None  # similarity or LLM confidence backing reply_to_id
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass
class PipelineArtifacts:
    """Bundle of file paths and figures produced by one pipeline run."""

    out_dir: str
    storyline_png: str
    storyline_html: str  # embeddable HTML snippet, not a file path
    metrics_csv: str
    interactions_jsonl: str
    graph_json: str

    summary_text: str
    figs: Dict[str, Any]  # name -> plotly figure
|
conversation_storyline/topic_shifts.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple, Dict
|
| 4 |
+
import numpy as np
|
| 5 |
+
import ruptures as rpt
|
| 6 |
+
|
| 7 |
+
from .config import settings
|
| 8 |
+
from .schemas import Interaction
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def detect_topic_shifts_ruptures(E: np.ndarray) -> List[int]:
    """
    Change-point detection over the (normalized) embedding matrix.

    Returns message indices that START a new segment (index 0 excluded).
    Returns [] when the conversation is too short to hold two minimum-size
    segments.
    """
    n = E.shape[0]
    if n < settings.topic_min_size * 2:
        return []

    # Signal is the (n, d) embedding matrix; ruptures consumes it directly.
    algo = rpt.Pelt(model="rbf", min_size=settings.topic_min_size).fit(E)
    # Penalty scales with log(n) so longer chats need stronger evidence to split.
    pen = settings.topic_penalty_scale * np.log(max(n, 2))
    bkps = algo.predict(pen=pen)  # returns endpoints (includes n)

    # convert endpoints to segment starts, dropping too-short segments
    starts = []
    prev = 0
    for end in bkps:
        if end >= n:
            break
        if end - prev >= settings.topic_min_size:
            starts.append(end)
        prev = end

    return starts
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def detect_topic_shifts_ensemble(E: np.ndarray) -> List[int]:
    """
    Combine two shift detectors over the embedding matrix:
      - ruptures change-point detection
      - indices whose adjacent-message cosine similarity falls in the
        worst decile
    Returns the sorted union of interior indices (0 < idx < n).
    """
    n = E.shape[0]
    candidates = set(detect_topic_shifts_ruptures(E))
    if n >= 4:
        # rows are unit-normalized, so the dot product IS the cosine sim
        adjacent_sims = (E[1:] * E[:-1]).sum(axis=1)
        cutoff = float(np.percentile(adjacent_sims, 10))
        candidates.update(
            idx for idx, sim in enumerate(adjacent_sims, start=1) if sim <= cutoff
        )
    return sorted(idx for idx in candidates if 0 < idx < n)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def build_segments(n_messages: int, shift_starts: List[int]) -> List[Tuple[int, int, int]]:
    """
    Turn shift start indices into contiguous segments.

    Returns (segment_id, start, end_exclusive) tuples covering
    [0, n_messages); duplicate or out-of-range starts are ignored.
    """
    interior = sorted({s for s in shift_starts if 0 < s < n_messages})
    boundaries = [0] + interior + [n_messages]
    return [
        (seg_id, lo, hi)
        for seg_id, (lo, hi) in enumerate(zip(boundaries, boundaries[1:]))
    ]
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def assign_topics_basic(interactions: List[Interaction], segments: List[Tuple[int, int, int]]) -> None:
    """Stamp each message with its segment id and a placeholder label (in place)."""
    for seg_id, start, end in segments:
        for msg_idx in range(start, end):
            interactions[msg_idx].topic_id = seg_id
            interactions[msg_idx].topic_label = f"Tema {seg_id}"
|
requirements.txt
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
-
gradio
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
networkx
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.1
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
pandas==2.2.2
|
| 4 |
+
matplotlib==3.9.0
|
| 5 |
+
plotly==5.22.0
|
| 6 |
+
networkx==3.3
|
| 7 |
+
ruptures==1.1.9
|
| 8 |
+
ortools==9.10.4067
|
| 9 |
+
scikit-learn==1.5.1
|
| 10 |
+
sentence-transformers==3.0.1
|
| 11 |
+
|
| 12 |
+
# Opcional (solo si quieres refinamiento LLM):
|
| 13 |
+
openai==1.40.8
|