# Provenance: uploaded by shahidshaikh ("Upload 4 files", commit ca12042, verified).
import json
import os
from dotenv import load_dotenv
load_dotenv()
from typing import TypedDict, Dict, Any, List, Optional
from langgraph.checkpoint.memory import InMemorySaver
from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.prebuilt import create_react_agent
from tools import fetch_papers, save_papers, save_output, read_output, run_clustering, OUT_DIR
# ════════════════════════════════════════════════════════════════
# LLM FACTORY
# ════════════════════════════════════════════════════════════════
def _llm(size: str = "big", require_json: bool = False):
    """Build the chat model for the provider selected by ``LLM_PROVIDER``.

    The provider is read from the ``LLM_PROVIDER`` environment variable
    (default ``"groq"``, surrounding whitespace and case ignored).
    ``"mistral"`` selects ``ChatMistralAI``; every other value falls back to
    ``ChatGroq``. The model name for each size tier can be overridden through
    the ``MISTRAL_BIG`` / ``MISTRAL_SMALL`` / ``MISTRAL_BULK`` and
    ``GROQ_BIG`` / ``GROQ_SMALL`` / ``GROQ_BULK`` environment variables.

    Args:
        size: Model tier to use: ``"big"``, ``"small"`` or ``"bulk"``.
        require_json: When true, the model is returned bound with
            ``response_format={"type": "json_object"}``.

    Returns:
        The constructed chat model, or the result of ``model.bind(...)`` when
        ``require_json`` is set.

    Raises:
        KeyError: If ``size`` is not a known tier. Raised before the provider
            package is imported or any client is constructed.
    """
    provider = os.getenv("LLM_PROVIDER", "groq").strip().lower()

    if provider == "mistral":
        models = {
            "big": os.getenv("MISTRAL_BIG", "mistral-large-latest"),
            "small": os.getenv("MISTRAL_SMALL", "mistral-small-latest"),
            "bulk": os.getenv("MISTRAL_BULK", "open-mixtral-8x22b"),
        }
    else:
        # Default: Groq. Any unrecognised provider value lands here too.
        provider = "groq"
        models = {
            "big": os.getenv("GROQ_BIG", "llama-3.3-70b-versatile"),
            "small": os.getenv("GROQ_SMALL", "llama-3.1-8b-instant"),
            "bulk": os.getenv("GROQ_BULK", "llama-3.3-70b-versatile"),
        }

    # Fail fast with a readable message instead of a bare KeyError raised
    # halfway through client construction.
    if size not in models:
        raise KeyError(
            f"unknown model size {size!r} for provider {provider!r}; "
            f"expected one of {sorted(models)}"
        )

    # Provider packages are imported lazily so only the selected one is needed.
    if provider == "mistral":
        from langchain_mistralai import ChatMistralAI

        model = ChatMistralAI(
            model=models[size],
            # None is passed through when the variable is unset.
            mistral_api_key=os.getenv("MISTRAL_API_KEY"),
            temperature=0,
            max_tokens=2000,
        )
    else:
        from langchain_groq import ChatGroq

        model = ChatGroq(
            model=models[size],
            # None is passed through when the variable is unset.
            groq_api_key=os.getenv("GROQ_API_KEY"),
            temperature=0,
            max_tokens=2000,
        )

    if require_json:
        return model.bind(response_format={"type": "json_object"})
    return model
# ════════════════════════════════════════════════════════════════
# AGENT SETUP
# ════════════════════════════════════════════════════════════════
# System prompt handed verbatim to the agent (see `prompt=system_prompt`).
# It is a plain string, not an f-string: the `{mode}` placeholders and the
# JSON braces are sent to the model literally.
# NOTE(review): the phase numbering jumps from Phase 4 to Phase 6 - confirm
# whether Phase 5 is intentionally handled outside this prompt.
system_prompt = """You are an advanced Topic Modelling Agent operating a rigorous thematic analysis pipeline.
You have access to a suite of deterministic python tools to help you process data and store files.
PIPELINE RULES:
1. When asked to "run abstract only" or "run title only", call the `run_clustering` tool.
2. `run_clustering` will return raw HTML cluster mapping and sentences AND save `{mode}_summaries.json` automatically.
3. You MUST analyse these representatives and assign a short 'label' (MAX 3 words capturing the core subject. DO NOT use generic filler like "Focuses on" or "AI in"), 'confidence', and 'reasoning' to each topic.
FEW-SHOT EXAMPLE INPUT: {"0": {"top_sentences": ["Diagnostic ML triage in EU hospitals.", "Automated screening for ER patients."]}}
FEW-SHOT EXAMPLE OUTPUT: {"0": {"label": "Diagnostic Triage Automation", "confidence": "high", "reasoning": "Sentences focus on automated ML screening in clinical settings."}}
4. You MUST save the labels to `{mode}_labels.json` via the `save_output` tool.
STRICT JSON SCHEMA: Your file MUST be a dictionary where keys are topic IDs (strings) and values are objects.
FORMAT: {"0": {"label": "...", "confidence": "...", "reasoning": "..."}, "1": {...}}
WARNING: DO NOT wrap the dictionary in a top-level "labels" key. DO NOT return a list.
5. Once labels are saved, respond to the user with a message confirming "Phase 2 Complete! You can now review the table in the 'Review Table' tab below."
6. STOP and wait for instructions after Phase 2 is complete.
7. Phase 3: THEMES. When the user asks you to "group" or apply decisions, analyze the topics and formulate them into "Themes".
STRICT THEME SCHEMA: You MUST save themes to `{mode}_themes.json` as a JSON object whose single top-level "themes" key holds a list of theme objects.
FORMAT: {"themes": [{"theme_id": integer, "name": string, "member_topics": list[int], "sentence_count": integer, "paper_count": integer}]}
WARNING: DO NOT return dictionaries keyed by names. DO NOT return nested arbitrary structures.
8. STOP and wait for review after Phase 3 is complete.
9. Phase 4: PAJAIS. Map the themes to the taxonomy and save output via `save_output` as `{mode}_taxonomy_map.json`.
STRICT PAJAIS SCHEMA: You MUST save output as a dictionary of topics objects.
FORMAT: {"0": {"theme_id": 0, "name": "Theme Name", "pajais_match": "Taxonomy Category", "confidence": "high", "reasoning": "..."}}
10. Phase 6: REPORT. When asked to generate the final report, you MUST generate TWO files using the `save_output` tool.
First: `comparison.csv` mapping abstract vs title themes side-by-side.
Second: `narrative.txt` containing 400-600 words of structured academic text with EXACTLY these headers: "1. Overview of dataset", "2. Key themes identified", "3. Societal implications", "4. PAJAIS mapping insights", "5. Future research directions".
11. Use emojis, format visually, and communicate clearly. Summarize operations explicitly in markdown.
"""
# Build the single conversational ReAct agent shared by the whole module.
# NOTE: this statement runs at import time, so `_llm("big")` is evaluated
# (and the provider client constructed) as soon as the module is imported.
supervisor = create_react_agent(
    model=_llm("big"),
    # Tools the agent may call; all are imported from the local `tools` module.
    tools=[fetch_papers, save_papers, save_output, read_output, run_clustering],
    prompt=system_prompt,
    # Conversation state is checkpointed with InMemorySaver; presumably it is
    # held in process memory only and lost on restart - confirm if persistence matters.
    checkpointer=InMemorySaver()
)
def chat_with_agent(message: str, thread_id: str):
    """Send one user message to the agent and return the latest reply's content.

    Args:
        message: Text of the user turn.
        thread_id: Identifier placed under ``configurable.thread_id`` in the
            invocation config passed to ``supervisor.invoke``.

    Returns:
        The ``content`` attribute of the last entry in the ``"messages"``
        list returned by ``supervisor.invoke``.
    """
    user_turn = {"role": "user", "content": message}
    run_config = {"configurable": {"thread_id": thread_id}}
    result = supervisor.invoke({"messages": [user_turn]}, run_config)
    final_message = result["messages"][-1]
    return final_message.content