Spaces:

aadisawant2912
/

topic_modelling

Sleeping

File size: 9,947 Bytes

c77d030
c5caec8
3a0d2fd
 
 
d149086
c77d030
c5caec8
c77d030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a0d2fd
c77d030
c5caec8
 
 
 
3a0d2fd
 
 
 
 
 
c5caec8
3a0d2fd
 
 
 
 
 
 
 
 
 
 
 
 
d149086
3a0d2fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d149086
3a0d2fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5caec8
 
c77d030
c5caec8
3a0d2fd
 
 
 
 
 
 
 
 
 
 
 
 
 
c5caec8
 
 
 
c77d030
3a0d2fd
c5caec8
 
3a0d2fd
 
c5caec8
d149086
3a0d2fd
c5caec8
 
3a0d2fd
 
c5caec8
 
3a0d2fd
 
c5caec8
c77d030
3a0d2fd
 
 
c77d030
c5caec8
3a0d2fd
 
 
c77d030
 
d149086
3a0d2fd
c77d030
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5caec8
 
c77d030
 
 
 
 
 
c5caec8
 
 
 
 
 
c77d030
 
 
c5caec8
c77d030
 
c5caec8
c77d030
0637141

"""
agent.py - Braun & Clarke (2006) Thematic Analysis Agent.

KEY DESIGN: Each run (abstract / title) uses its own FRESH thread.
This prevents the abstract conversation history from confusing the title run.
The app creates a new thread_id when "run title" is detected and passes it here.
"""

from __future__ import annotations

from dotenv import load_dotenv
load_dotenv()

from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_mistralai import ChatMistralAI
from langchain_core.messages import AIMessage, ToolMessage

from tools import (
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    generate_comparison_csv,
    export_narrative,
)

# ── System prompt ──────────────────────────────────────────────────────────────
# Instruction text handed to the ReAct agent through the `prompt=` argument of
# `create_react_agent` further down in this module. It describes the phase-by-
# phase workflow for the "abstract" and "title" runs, the stop gates at which
# the agent must wait for the researcher, and the seven tools imported from
# `tools` at the top of this file.
#
# The text is runtime data read by the LLM, not documentation: editing any
# wording here changes agent behaviour. `.strip()` at the end removes the
# leading/trailing newlines introduced by the triple-quoted layout.
SYSTEM_PROMPT = """
You are a computational thematic analysis expert for systematic literature reviews
in Information Systems, following Braun & Clarke (2006) rigorously.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
ROLE
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
You guide a researcher through Braun & Clarke (2006) 6-phase thematic
analysis. You run the same 6 phases TWICE — once on abstracts, once on
titles. After BOTH runs are complete you generate final outputs.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
FULL WORKFLOW
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

=== ABSTRACT RUN ===
Triggered by: researcher types "run abstract"

Phase 1 — Familiarisation (run_config="abstract"):
  Call: load_scopus_csv(csv_path="data/uploaded.csv", run_config="abstract")
  Show: papers count, sentences count, data quality notes
  STOP: "Abstract Phase 1 complete. Type yes to run BERTopic clustering."

Phase 2 — Initial Codes (run_config="abstract"):
  Call: run_bertopic_discovery(top_n_topics=100, run_config="abstract")
  Call: label_topics_with_llm(batch_size=15, run_config="abstract")
  Tell researcher: "Review Table is now populated with ~100 abstract topics.
  Go to Section 3 → Review Table tab → click Refresh Table to see them.
  Tick Approve for topics to keep. Fill Rename To to group into themes.
  Click Submit Review when done."
  STOP GATE 1: "Waiting for Submit Review on abstract topics."

Phase 3 — Themes (run_config="abstract"):
  Call: consolidate_into_themes(approved_groups=<JSON from submit>, run_config="abstract")
  Show: theme names and sentence counts
  STOP GATE 2: "Abstract themes consolidated. Type yes to check coverage."

Phase 4 — Saturation (run_config="abstract"):
  Calculate % coverage per theme from sentence counts
  Flag any theme with < 2% coverage as weak
  STOP GATE 3: "Type satisfied to confirm coverage and name themes."

Phase 5 — Naming (run_config="abstract"):
  Show final theme names
  Accept: confirm OR revise: "NewName1","NewName2"
  Proceed immediately to Phase 5.5

Phase 5.5 — PAJAIS Mapping (run_config="abstract"):
  Call: compare_with_taxonomy(run_config="abstract")
  Show table: Theme | PAJAIS Category | Confidence | Rationale
  STOP GATE 4: "Abstract PAJAIS mapping complete. Type yes to finish abstract run."

After Phase 5.5 confirmed:
  Say: "✅ ABSTRACT RUN COMPLETE.
  Abstract themes and PAJAIS mapping saved to data/abstract/.
  Now type 'run title' to run the same 6 phases on paper titles."

=== TITLE RUN ===
Triggered by: researcher types "run title"

Phase 1 — Familiarisation (run_config="title"):
  Call: load_scopus_csv(csv_path="data/uploaded.csv", run_config="title")
  Show: papers count, sentences count, data quality notes
  STOP: "Title Phase 1 complete. Type yes to run BERTopic clustering on titles."

Phase 2 — Initial Codes (run_config="title"):
  Call: run_bertopic_discovery(top_n_topics=100, run_config="title")
  Call: label_topics_with_llm(batch_size=15, run_config="title")
  Tell researcher: "Review Table now has ~100 title topics.
  Go to Section 3 → Review Table tab → click Refresh Table.
  Tick Approve, fill Rename To, click Submit Review."
  STOP GATE 1: "Waiting for Submit Review on title topics."

Phase 3 — Themes (run_config="title"):
  Call: consolidate_into_themes(approved_groups=<JSON from submit>, run_config="title")
  Show: theme names and sentence counts
  STOP GATE 2: "Title themes consolidated. Type yes to check coverage."

Phase 4 — Saturation (run_config="title"):
  Calculate % coverage, flag weak themes
  STOP GATE 3: "Type satisfied to confirm and name title themes."

Phase 5 — Naming (run_config="title"):
  Show final theme names, accept confirm or revise
  Proceed to Phase 5.5

Phase 5.5 — PAJAIS Mapping (run_config="title"):
  Call: compare_with_taxonomy(run_config="title")
  Show table: Theme | PAJAIS Category | Confidence | Rationale
  STOP GATE 4: "Title PAJAIS mapping complete. Type yes to generate final outputs."

After Phase 5.5 confirmed:
  Call: generate_comparison_csv()
  Call: export_narrative()
  Show summary:
    - Abstract themes: [list them]
    - Abstract PAJAIS: [list mappings]
    - Title themes: [list them]
    - Title PAJAIS: [list mappings]
  Say: "✅ BOTH RUNS COMPLETE.
  comparison.csv (Title | Abstract | Year | Source Journal) and
  narrative.txt (500-word Section 7) are ready in the Download tab."

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
CRITICAL RULES
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. ONE PHASE PER MESSAGE — complete one phase then STOP and wait.
2. ALWAYS PASS run_config — every tool call must include run_config=
   ("abstract" for abstract run, "title" for title run).
3. NEVER MIX RUN CONFIGS — do not use run_config="title" during
   the abstract run or vice versa.
4. ALL APPROVALS VIA REVIEW TABLE — never ask for topic approval in chat.
5. WAIT FOR SUBMIT REVIEW — after Phase 2, do not proceed until
   the Submit Review message arrives with the approved_groups JSON.
6. NEVER SKIP STOP GATES — 4 gates per run.
7. NEVER generate comparison CSV or narrative until BOTH runs have
   completed Phase 5.5.
8. NO HALLUCINATION — only reference data returned by tools.
9. When you see "run abstract" → start ABSTRACT RUN Phase 1.
10. When you see "run title" → start TITLE RUN Phase 1.

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
TOOLS
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
1. load_scopus_csv(csv_path, run_config)
   Loads CSV, filters boilerplate, saves sentences to data/{run_config}/

2. run_bertopic_discovery(top_n_topics=100, run_config)
   Embeds sentences, clusters into ~100 topics (IDs 1..N),
   saves summaries + charts to data/{run_config}/

3. label_topics_with_llm(batch_size=15, run_config)
   Labels topics with Mistral LLM, updates data/{run_config}/summaries.json

4. consolidate_into_themes(approved_groups, run_config)
   Merges approved topic groups into themes,
   saves to data/{run_config}/themes.json

5. compare_with_taxonomy(run_config)
   Maps themes to PAJAIS 25 categories,
   saves to data/{run_config}/taxonomy.json

6. generate_comparison_csv()
   REQUIRES BOTH RUNS COMPLETE.
   Produces data/comparison.csv with columns:
   Title | Abstract | Year | Source Journal

7. export_narrative()
   REQUIRES BOTH RUNS COMPLETE.
   Produces data/narrative.txt — 500-word Section 7
   covering themes from BOTH abstract and title runs.
""".strip()

# Shared checkpointer from `langgraph.checkpoint.memory`; the agent stores its
# per-thread conversation state here and `clean_thread_history` edits it.
_memory = MemorySaver()

# Chat model that drives the agent's reasoning and tool selection.
_llm = ChatMistralAI(temperature=0.3, model="mistral-small-latest")

# Tools exposed to the agent, listed in the order the workflow uses them.
_tools = [
    # per-run pipeline (take run_config)
    load_scopus_csv,
    run_bertopic_discovery,
    label_topics_with_llm,
    consolidate_into_themes,
    compare_with_taxonomy,
    # final outputs, produced once both runs are complete
    generate_comparison_csv,
    export_narrative,
]

# The compiled ReAct agent: model + tools + system prompt, with state
# checkpointed per thread_id through `_memory`.
agent = create_react_agent(
    model=_llm,
    tools=_tools,
    prompt=SYSTEM_PROMPT,
    checkpointer=_memory,
)


def clean_thread_history(thread_id: str) -> None:
    """Drop dangling tool-call exchanges from a thread's checkpointed history.

    An ``AIMessage`` is considered dangling when it carries tool calls and at
    least one of them has no ``ToolMessage`` with a matching ``tool_call_id``
    in the stored history. Such messages are removed. Any ``ToolMessage`` that
    answered *another* call of the same removed ``AIMessage`` is removed as
    well, so that no tool result is left behind without the assistant message
    that requested it.

    The function is a no-op when the thread has no checkpoint, no messages,
    or nothing needs to be removed.

    Args:
        thread_id: Identifier of the conversation thread whose checkpoint in
            the module-level ``_memory`` saver should be cleaned.

    Returns:
        None. The cleaned message list is written back through
        ``_memory.put``.
    """
    config = {"configurable": {"thread_id": thread_id}}
    checkpoint = _memory.get(config)
    if checkpoint is None:
        return

    messages = checkpoint.get("channel_values", {}).get("messages", [])
    if not messages:
        return

    # IDs of every tool call that received a response somewhere in the history.
    answered_ids = {
        msg.tool_call_id for msg in messages if isinstance(msg, ToolMessage)
    }

    def _call_ids(msg: AIMessage) -> list:
        # `tool_calls` may be missing or empty; treat both as "no calls".
        return [call.get("id") for call in (getattr(msg, "tool_calls", None) or [])]

    # Call IDs owned by AIMessages that are about to be removed. A partially
    # answered AIMessage contributes *all* of its call IDs here so that the
    # responses it did receive are removed together with it.
    orphaned_ids = set()
    for msg in messages:
        if isinstance(msg, AIMessage):
            ids = _call_ids(msg)
            if any(call_id not in answered_ids for call_id in ids):
                orphaned_ids.update(ids)

    def _keep(msg) -> bool:
        if isinstance(msg, AIMessage):
            return all(call_id in answered_ids for call_id in _call_ids(msg))
        if isinstance(msg, ToolMessage):
            return msg.tool_call_id not in orphaned_ids
        return True

    cleaned = [msg for msg in messages if _keep(msg)]
    if len(cleaned) == len(messages):
        return  # nothing was removed; leave the checkpoint untouched

    checkpoint["channel_values"]["messages"] = cleaned
    # NOTE(review): metadata and new_versions are passed empty, as before.
    # Confirm that the installed LangGraph checkpointer persists the updated
    # channel values under these arguments.
    _memory.put(config, checkpoint, {}, {})