import json
import logging
import os
from typing import TypedDict, Dict, Any, List, Optional

from dotenv import load_dotenv

# Kept ahead of the remaining imports, as in the original ordering, in case
# any of them (e.g. `tools`) reads the environment at import time.
load_dotenv()

from langchain_core.messages import SystemMessage, HumanMessage
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.prebuilt import create_react_agent

from tools import fetch_papers, save_papers, save_output, read_output, run_clustering, OUT_DIR
|
|
| |
| |
| |
def _llm(size: str = "big", require_json: bool = False):
    """Build the chat model for the configured provider and size tier.

    The provider is read from the ``LLM_PROVIDER`` environment variable
    (default ``"groq"``): ``"mistral"`` selects ``ChatMistralAI`` and every
    other value selects ``ChatGroq``.  Each tier's model name can be
    overridden through its own environment variable (``GROQ_BIG``,
    ``MISTRAL_SMALL``, ...); a blank override is treated as unset.

    Args:
        size: Model tier, one of ``"big"``, ``"small"`` or ``"bulk"``.
        require_json: When true, return the model bound with
            ``response_format={"type": "json_object"}``.

    Returns:
        The chat model instance, or the result of ``model.bind(...)`` when
        ``require_json`` is set.

    Raises:
        KeyError: If ``size`` is not one of the known tiers.
    """
    # (override env var, default model name) for each tier of each provider.
    catalog = {
        "mistral": {
            "big": ("MISTRAL_BIG", "mistral-large-latest"),
            "small": ("MISTRAL_SMALL", "mistral-small-latest"),
            "bulk": ("MISTRAL_BULK", "open-mixtral-8x22b"),
        },
        "groq": {
            "big": ("GROQ_BIG", "llama-3.3-70b-versatile"),
            "small": ("GROQ_SMALL", "llama-3.1-8b-instant"),
            "bulk": ("GROQ_BULK", "llama-3.3-70b-versatile"),
        },
    }

    configured = os.getenv("LLM_PROVIDER", "groq").strip().lower()
    provider = configured if configured in catalog else "groq"
    if configured and configured != provider:
        # The Groq fallback is kept for compatibility, but a typo such as
        # "mistrl" should not go unnoticed.
        logging.getLogger(__name__).warning(
            "Unrecognised LLM_PROVIDER %r; falling back to 'groq'.", configured
        )

    # Validate the tier before importing any provider package so a bad
    # argument fails fast with a message that names the valid choices.
    tiers = catalog[provider]
    if size not in tiers:
        raise KeyError(
            f"unknown model size {size!r}; expected one of {sorted(tiers)}"
        )

    env_name, default_model = tiers[size]
    # `or` instead of a getenv default: a blank override (e.g. `GROQ_BIG=`
    # left in a .env template) must not yield an empty model name.
    model_name = (os.getenv(env_name) or "").strip() or default_model

    # Imports stay inside the branches so that only the selected provider's
    # package is imported.
    if provider == "mistral":
        from langchain_mistralai import ChatMistralAI

        model = ChatMistralAI(
            model=model_name,
            mistral_api_key=os.getenv("MISTRAL_API_KEY"),
            temperature=0,
            max_tokens=2000,
        )
    else:
        from langchain_groq import ChatGroq

        model = ChatGroq(
            model=model_name,
            groq_api_key=os.getenv("GROQ_API_KEY"),
            temperature=0,
            max_tokens=2000,
        )

    if require_json:
        return model.bind(response_format={"type": "json_object"})
    return model
|
|
| |
| |
| |
| |
# System prompt handed to the ReAct agent (`prompt=system_prompt` below).
# It is a plain string, not an f-string: the `{mode}` placeholders reach the
# model verbatim and are presumably meant to be filled in by the model with
# the current run mode (abstract/title) - confirm against the tools' file
# naming.
# The labels FORMAT line in rule 4 lists "confidence" so that the strict
# schema agrees with rule 3 and the few-shot example, which both require it.
# NOTE(review): the phases named here are 2, 3, 4 and 6; no Phase 1 or
# Phase 5 is described - confirm that is intentional.
system_prompt = """You are an advanced Topic Modelling Agent operating a rigorous thematic analysis pipeline.
You have access to a suite of deterministic python tools to help you process data and store files.

PIPELINE RULES:
1. When asked to "run abstract only" or "run title only", call the `run_clustering` tool.
2. `run_clustering` will return raw HTML cluster mapping and sentences AND save `{mode}_summaries.json` automatically.
3. You MUST analyse these representatives and assign a short 'label' (MAX 3 words capturing the core subject. DO NOT use generic filler like "Focuses on" or "AI in"), 'confidence', and 'reasoning' to each topic.
FEW-SHOT EXAMPLE INPUT: {"0": {"top_sentences": ["Diagnostic ML triage in EU hospitals.", "Automated screening for ER patients."]}}
FEW-SHOT EXAMPLE OUTPUT: {"0": {"label": "Diagnostic Triage Automation", "confidence": "high", "reasoning": "Sentences focus on automated ML screening in clinical settings."}}
4. You MUST save the labels to `{mode}_labels.json` via the `save_output` tool.
STRICT JSON SCHEMA: Your file MUST be a dictionary where keys are topic IDs (strings) and values are objects.
FORMAT: {"0": {"label": "...", "confidence": "...", "reasoning": "..."}, "1": {...}}
WARNING: DO NOT wrap the dictionary in a top-level "labels" key. DO NOT return a list.
5. Once labels are saved, respond to the user with a message confirming "Phase 2 Complete! You can now review the table in the 'Review Table' tab below."
6. STOP and wait for instructions after Phase 2 is complete.
7. Phase 3: THEMES. When the user asks you to "group" or apply decisions, analyze the topics and formulate them into "Themes".
STRICT THEME SCHEMA: You MUST save themes to `{mode}_themes.json` as a list of objects.
FORMAT: {"themes": [{"theme_id": integer, "name": string, "member_topics": list[int], "sentence_count": integer, "paper_count": integer}]}
WARNING: DO NOT return dictionaries keyed by names. DO NOT return nested arbitrary structures.
8. STOP and wait for review after Phase 3 is complete.
9. Phase 4: PAJAIS. Map the themes to the taxonomy and save output via `save_output` as `{mode}_taxonomy_map.json`.
STRICT PAJAIS SCHEMA: You MUST save output as a dictionary of topics objects.
FORMAT: {"0": {"theme_id": 0, "name": "Theme Name", "pajais_match": "Taxonomy Category", "confidence": "high", "reasoning": "..."}}
10. Phase 6: REPORT. When asked to generate the final report, you MUST generate TWO files using the `save_output` tool.
First: `comparison.csv` mapping abstract vs title themes side-by-side.
Second: `narrative.txt` containing 400-600 words of structured academic text with EXACTLY these headers: "1. Overview of dataset", "2. Key themes identified", "3. Societal implications", "4. PAJAIS mapping insights", "5. Future research directions".
11. Use emojis, format visually, and communicate clearly. Summarize operations explicitly in markdown.
"""
|
|
| |
# Module-level ReAct agent, built once when this module is imported.
# It runs on the "big" model tier, can call the five pipeline tools, and is
# steered by `system_prompt`.  Conversation state goes through an
# InMemorySaver checkpointer, so it presumably lives only in this process.
supervisor = create_react_agent(
    _llm("big"),
    [fetch_papers, save_papers, save_output, read_output, run_clustering],
    prompt=system_prompt,
    checkpointer=InMemorySaver(),
)
|
|
def chat_with_agent(message: str, thread_id: str):
    """Send one user message to the supervisor agent and return its reply.

    Args:
        message: Text submitted as the user turn.
        thread_id: Value passed as the ``thread_id`` configurable of the
            invocation; presumably selects which checkpointed conversation
            is continued.

    Returns:
        The ``content`` of the last message in the agent's result.
    """
    user_turn = {"role": "user", "content": message}
    run_config = {"configurable": {"thread_id": thread_id}}
    result = supervisor.invoke({"messages": [user_turn]}, run_config)
    final_message = result["messages"][-1]
    return final_message.content