Spaces:
Sleeping
Sleeping
Upload 9 files
Browse files- app.py +166 -0
- requirements.txt +13 -0
- research_agent/__init__.py +0 -0
- research_agent/agent.py +140 -0
- research_agent/config.py +18 -0
- research_agent/llm_utils.py +22 -0
- research_agent/prompts.py +132 -0
- research_agent/rag_pipeline.py +67 -0
- research_agent/search.py +18 -0
app.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import google.generativeai as genai
|
| 4 |
+
from tavily import TavilyClient
|
| 5 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 6 |
+
|
| 7 |
+
from research_agent.config import AgentConfig
|
| 8 |
+
from research_agent.agent import get_clarifying_questions, research_and_plan, write_report_stream
|
| 9 |
+
|
| 10 |
+
# --- CSS for styling the Gradio app ---
# Injected via gr.Blocks(css=CSS). The .status_box and .report_output class
# names are referenced by elem_classes on the Markdown components below.
CSS = """
body { font-family: 'Inter', sans-serif; background-color: #F0F2F6; }
.gradio-container { max-width: 960px !important; margin: auto !important; }
h1 { text-align: center; font-size: 2.5em; color: #1E3A8A; }
.gr-button { background-color: #2563EB; color: white; }
.gr-button:hover { background-color: #1E4ED8; }
.status_box {
background-color: #FFFFFF;
border-radius: 8px;
padding: 15px;
border: 1px solid #E5E7EB;
box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.report_output {
background-color: #FFFFFF;
border-radius: 8px;
padding: 20px;
border: 1px solid #E5E7EB;
box-shadow: 0 4px 8px rgba(0,0,0,0.05);
}
"""
|
| 32 |
+
|
| 33 |
+
# --- Global variables for models (to avoid reloading) ---
# Populated lazily by initialize_models(); None means "not loaded yet".
writer_model = None      # Gemini model used to write report sections
planner_model = None     # Gemini model used for planning/clarification
embedding_model = None   # SentenceTransformer for dense retrieval
reranker = None          # CrossEncoder for re-ranking retrieved chunks
tavily_client = None     # Tavily web-search client (recreated on each init)
config = AgentConfig()   # static agent configuration (models, temperatures, limits)
|
| 40 |
+
|
| 41 |
+
def initialize_models(google_api_key, tavily_api_key):
    """Configure the Gemini/Tavily clients and lazily build the heavy ML models.

    Heavy objects are cached in module-level globals so repeated calls (one
    per UI interaction) do not reload them. Raises gr.Error when a key is
    missing or any initialization step fails.
    """
    global writer_model, planner_model, embedding_model, reranker, tavily_client

    if not google_api_key or not tavily_api_key:
        raise gr.Error("API keys are required. Please provide both Google and Tavily API keys.")

    try:
        # API clients are cheap; reconfigure on every call so fresh keys take effect.
        genai.configure(api_key=google_api_key)
        tavily_client = TavilyClient(api_key=tavily_api_key)

        # Expensive models are built at most once per process.
        if writer_model is None:
            writer_model = genai.GenerativeModel(config.WRITER_MODEL)
        if planner_model is None:
            # NOTE(review): the planner deliberately reuses WRITER_MODEL —
            # confirm a distinct planner model is not intended.
            planner_model = genai.GenerativeModel(config.WRITER_MODEL)
        if embedding_model is None:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        if reranker is None:
            reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')

        return "Models initialized successfully!"
    except Exception as e:
        raise gr.Error(f"Failed to initialize models. Please check your API keys. Error: {str(e)}")
|
| 64 |
+
|
| 65 |
+
def start_research_phase(topic, google_key, tavily_key):
    """Phase 1: validate the topic and ask the planner for clarifying questions.

    Returns Gradio component updates that reveal the clarification stage
    (populated with the generated questions) and hide the initial form.
    """
    initialize_models(google_key, tavily_key)

    if not topic:
        raise gr.Error("Research topic cannot be empty.")

    clarifying = get_clarifying_questions(planner_model, topic)

    # Advance the UI to the next stage.
    updates = {
        clarification_ui: gr.update(visible=True),
        clarification_questions_display: gr.update(value=clarifying),
        initial_ui: gr.update(visible=False),
    }
    return updates
|
| 80 |
+
|
| 81 |
+
def generate_report_phase(topic, answers, google_key, tavily_key):
    """Phase 2: plan, research, and write the full report, streaming progress.

    Yields dicts of Gradio component updates: the status box receives the
    agent's running log; the report pane receives the finished Markdown once
    generation completes. Raises gr.Error if the planning phase fails.
    """
    initialize_models(google_key, tavily_key)

    status_updates = "### Agent Status\n"
    yield {
        status_box: gr.update(value=status_updates + "-> Planning research...\n"),
        final_report: gr.update(value=None)
    }

    try:
        plan = research_and_plan(config, planner_model, tavily_client, topic, answers)
    except Exception as e:
        raise gr.Error(f"Failed during planning phase: {e}")

    status_updates += f"**Research Plan:**\n- **Topic:** {plan['detailed_topic']}\n- **Sections:** {[s.title for s in plan['sections']]}\n\n---\n"
    yield { status_box: gr.update(value=status_updates) }

    report_generator = write_report_stream(config, writer_model, tavily_client, embedding_model, reranker, plan)

    # BUG FIX: write_report_stream YIELDS status strings and RETURNS the
    # finished report as the generator's StopIteration value. The previous
    # `for` loop discarded that return value and kept only the last yielded
    # status line ("--- Report Generation Complete ---") as the "report".
    # Drive the generator manually so the real report text is captured.
    final_report_md = ""
    while True:
        try:
            update = next(report_generator)
        except StopIteration as stop:
            # The generator's return value is the complete report Markdown.
            if isinstance(stop.value, str):
                final_report_md = stop.value
            break
        if isinstance(update, str):
            status_updates += update
            yield { status_box: gr.update(value=status_updates) }

    yield { final_report: gr.update(value=final_report_md) }
|
| 109 |
+
|
| 110 |
+
# --- Build the Gradio Interface ---
# Three-stage flow: (1) topic + API keys, (2) clarifying Q&A, (3) streamed
# status log and final report.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as app:

    gr.Markdown("# Mini DeepSearch Agent")
    gr.Markdown("This agent performs in-depth research on a given topic, using AI to plan, search, and write a comprehensive report.")

    # State to hold the original topic
    topic_state = gr.State()

    # --- UI Stage 1: Initial Query ---
    # NOTE(review): gr.Box was removed in Gradio 4 (replaced by gr.Group) —
    # confirm the pinned Gradio version still provides it.
    with gr.Box(visible=True) as initial_ui:
        with gr.Row():
            google_api_key_input = gr.Textbox(label="Google API Key", type="password", placeholder="Enter your Google AI API Key")
            tavily_api_key_input = gr.Textbox(label="Tavily API Key", type="password", placeholder="Enter your Tavily Search API Key")

        topic_input = gr.Textbox(label="Research Topic", placeholder="e.g., The future of renewable energy")
        start_button = gr.Button("Start Research", variant="primary")

    # --- UI Stage 2: Clarification ---
    with gr.Box(visible=False) as clarification_ui:
        gr.Markdown("### To give you the most relevant report, could you please clarify:")
        clarification_questions_display = gr.Markdown(elem_classes="status_box")
        clarification_answers_input = gr.Textbox(label="Your Answers", placeholder="Provide your answers to the questions above to tailor the research...")
        generate_report_button = gr.Button("Generate Full Report", variant="primary")

    # --- UI Stage 3: Output ---
    # Both panes start hidden; show_outputs() reveals them on demand.
    with gr.Column():
        status_box = gr.Markdown(elem_classes="status_box", label="Agent Thought Process", visible=False)
        final_report = gr.Markdown(elem_classes="report_output", label="Final Research Report", visible=False)

    # --- Event Handlers ---
    def show_outputs():
        # Reveal the status log and report panes before streaming begins.
        return {
            status_box: gr.update(visible=True),
            final_report: gr.update(visible=True)
        }

    start_button.click(
        fn=start_research_phase,
        inputs=[topic_input, google_api_key_input, tavily_api_key_input],
        outputs=[initial_ui, clarification_ui, clarification_questions_display]
    ).then(
        fn=lambda topic: topic,
        inputs=[topic_input],
        outputs=[topic_state] # Save the topic for the next step
    )

    generate_report_button.click(
        fn=show_outputs,
        outputs=[status_box, final_report]
    ).then(
        fn=generate_report_phase,
        inputs=[topic_state, clarification_answers_input, google_api_key_input, tavily_api_key_input],
        outputs=[status_box, final_report]
    )

app.launch(debug=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
google-generativeai
|
| 2 |
+
tavily-python
|
| 3 |
+
pydantic
|
| 4 |
+
langchain
|
| 5 |
+
sentence-transformers
|
| 6 |
+
faiss-cpu
|
| 7 |
+
rank_bm25
|
| 8 |
+
transformers
|
| 9 |
+
torch
|
| 10 |
+
ipython
|
| 11 |
+
gradio
|
| 12 |
+
kaggle_secrets
|
| 13 |
+
nest_asyncio
|
research_agent/__init__.py
ADDED
|
File without changes
|
research_agent/agent.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
from IPython.display import display, Markdown
|
| 5 |
+
|
| 6 |
+
# Local module imports
|
| 7 |
+
from .config import AgentConfig
|
| 8 |
+
from . import prompts
|
| 9 |
+
from .llm_utils import run_gemini_json_completion, run_gemini_text_completion
|
| 10 |
+
from .search import gather_research
|
| 11 |
+
from .rag_pipeline import RAGPipeline
|
| 12 |
+
|
| 13 |
+
# For running async code in notebook
|
| 14 |
+
import nest_asyncio
|
| 15 |
+
nest_asyncio.apply()
|
| 16 |
+
|
| 17 |
+
class Section(BaseModel):
    """A single planned report section produced by the planner LLM."""
    title: str = Field(description="The title of the report section.")
    description: str = Field(description="A detailed description of what the section will cover, including key sub-topics.")
|
| 20 |
+
|
| 21 |
+
def run_verification_step(writer_model, section_text: str, research_context: str):
    """Fact-check a drafted section against the research it was written from.

    Asks the writer model (temperature 0 for determinism) to verify the
    draft; the verification prompt instructs it to reply "OK" when every
    claim is supported. Returns the original text on a pass, otherwise the
    text with an appended self-correction note carrying the model's feedback.
    """
    verification_prompt = prompts.verification_prompt_template.format(
        section_text=section_text,
        research_context=research_context
    )

    verification_result = run_gemini_text_completion(writer_model, verification_prompt, 0.0)

    # BUG FIX: the previous check (`"OK" in verification_result.upper()`)
    # passed whenever the letters "ok" appeared anywhere in the reply
    # (e.g. "LOOKS WRONG" contains "OK"), silently discarding corrections.
    # The prompt asks the model to answer "OK" on success, so require the
    # reply to start with it.
    if verification_result.strip().upper().startswith("OK"):
        return section_text
    else:
        return f"{section_text}\n\n---\n*Self-Correction Note: An issue was found during verification. The model suggested the following correction: {verification_result}*"
|
| 34 |
+
|
| 35 |
+
def get_clarifying_questions(planner_model, initial_topic: str):
    """Ask the planner model for scope-narrowing questions about the topic.

    Returns the model's reply as plain text (a numbered list per the prompt).
    """
    filled_prompt = prompts.clarification_prompt_template.format(initial_topic=initial_topic)
    return run_gemini_text_completion(planner_model, filled_prompt, 0.5)
|
| 40 |
+
|
| 41 |
+
def research_and_plan(config: AgentConfig, planner_model, tavily_client, initial_topic: str, user_answers: str):
    """Turn the user's topic and clarifications into a detailed research plan.

    Pipeline: (1) synthesize a precise topic string from the user's answers,
    (2) run a broad web search for planning context, (3) have the planner
    produce a sectioned outline, (4) expand each section with sub-topics.

    Returns a dict with "detailed_topic" (str) and "sections" (list[Section]).
    Raises ValueError when the planner yields no usable sections.
    """
    print("\n--- Step 1: Constructing Detailed Research Brief ---")
    brief_prompt = prompts.brief_constructor_prompt_template.format(
        initial_topic=initial_topic,
        user_answers=user_answers,
    )
    detailed_topic = run_gemini_text_completion(planner_model, brief_prompt, config.PLANNER_TEMPERATURE).strip()

    print("\n--- Step 2: Performing Broad Initial Research for Outline ---")
    initial_research = gather_research(tavily_client, [detailed_topic], config.INITIAL_SEARCH_RESULTS)
    planning_context = "\n\n".join(item['content'] for item in initial_research)

    # Cap the context so the planner prompt stays within model limits.
    planner_prompt = prompts.planner_prompt.format(topic=detailed_topic, context=planning_context[:20000])
    plan_response = run_gemini_json_completion(planner_model, planner_prompt, config.PLANNER_TEMPERATURE)

    try:
        initial_sections = [Section(**s) for s in plan_response.get("sections", [])]
    except Exception as e:
        raise ValueError(f"Could not create a valid report plan. Error: {e}")
    if not initial_sections:
        raise ValueError("Planner returned no sections.")

    print("\n--- Step 3: Expanding Outline for Deep Research ---")
    expanded_sections = []
    for section in initial_sections:
        expansion_prompt = prompts.expansion_prompt_template.format(
            section_title=section.title,
            section_description=section.description,
        )
        sub_topics_text = run_gemini_text_completion(planner_model, expansion_prompt, 0.6)
        # Fold the sub-topics into the description; they later seed the
        # per-section search queries in write_report_stream.
        section.description += "\n\nKey areas to investigate:\n" + sub_topics_text
        expanded_sections.append(section)

    return {"detailed_topic": detailed_topic, "sections": expanded_sections}
|
| 73 |
+
|
| 74 |
+
def write_report_stream(config: AgentConfig, writer_model, tavily_client, embedding_model, reranker, plan: dict):
    """Writes the report section by section, yielding progress updates.

    Yields human-readable status strings as it works and RETURNS the full
    report Markdown as the generator's StopIteration value — callers must
    drive it with next()/StopIteration.value (a plain ``for`` loop discards
    the return value).
    """

    detailed_topic = plan["detailed_topic"]
    sections = plan["sections"]

    yield f"### Starting Report Generation for: {detailed_topic}\n\n"

    # Accumulates the growing report text plus every cited source URL.
    report_state = {"full_report_text": f"# Deep Research Report: {detailed_topic}\n\n", "all_source_urls": set()}
    rag_pipeline = RAGPipeline(embedding_model, reranker)

    for i, section in enumerate(sections):
        yield f"--- \n### Processing Section {i+1}/{len(sections)}: {section.title}...\n"

        # Earlier sections are passed to the writer so it can avoid repetition.
        previous_sections_context = report_state["full_report_text"]

        # Build search queries from the section title plus the last three
        # description lines (the planner-generated "key areas to investigate").
        section_queries = [f"{detailed_topic} - {section.title}"] + section.description.split('\n')[-3:]
        section_queries = [q.strip() for q in section_queries if q.strip()]
        # Truncate each query — presumably a search-API query-length limit; confirm.
        section_queries = [q[:400] for q in section_queries]

        yield f"-> Searching the web for: `{'`, `'.join(section_queries)}`\n"
        section_research = gather_research(tavily_client, section_queries, config.DEEP_DIVE_SEARCH_RESULTS)

        if not section_research:
            # No sources: emit a placeholder section rather than aborting the run.
            section_content = f"## {section.title}\n\nNo research material could be gathered for this section.\n\n"
            report_state["full_report_text"] += section_content
            continue

        yield f"-> Found {len(section_research)} sources. Indexing for RAG...\n"
        # The RAG index is rebuilt per section from that section's research.
        rag_pipeline.index_research(section_research)
        top_chunks_with_meta = rag_pipeline.retrieve_and_rerank(section.description, top_k=config.CHUNKS_TO_USE_FOR_WRITING)

        # Number sources per-section ([1], [2], ...) for in-text citations.
        context_for_llm = ""
        cited_sources_for_section = {}
        citation_counter = 1
        for item in top_chunks_with_meta:
            source_url = item['source']
            report_state["all_source_urls"].add(source_url)
            if source_url not in cited_sources_for_section:
                cited_sources_for_section[source_url] = citation_counter
                citation_counter += 1
            citation_num = cited_sources_for_section[source_url]
            context_for_llm += f"Source [{citation_num}]: {item['content']}\n\n"

        bibliography = "\n".join(f"[{num}] {url}" for url, num in cited_sources_for_section.items())

        yield f"-> Synthesizing and writing section content...\n"
        writer_prompt = prompts.writer_prompt_template.format(
            writer_system_instruction=prompts.writer_system_instruction,
            previous_sections_context=previous_sections_context,
            section_title=section.title,
            context_for_llm=context_for_llm
        )

        draft_content = run_gemini_text_completion(writer_model, writer_prompt, config.WRITER_TEMPERATURE)

        yield "-> Fact-checking and verifying section...\n"
        # Verification may append a self-correction note to the draft.
        final_content = run_verification_step(writer_model, draft_content, context_for_llm)
        final_content_with_sources = f"## {section.title}\n\n{final_content}\n\n**Sources Used in this Section**\n{bibliography}\n\n"

        report_state["full_report_text"] += final_content_with_sources

    # Deduplicated, sorted list of every URL used across all sections.
    final_bibliography = "\n".join(f"- {url}" for url in sorted(list(report_state["all_source_urls"])))
    report_state["full_report_text"] += f"## Master Bibliography\n\n{final_bibliography}"

    yield "\n--- Report Generation Complete ---\n"
    return report_state["full_report_text"]
|
research_agent/config.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel
|
| 2 |
+
|
| 3 |
+
class AgentConfig:
    """Configuration settings for the Max-Depth agent."""
    # Gemini model name; reused for both the writer and the planner
    # (see app.initialize_models).
    WRITER_MODEL = "gemini-1.5-flash-latest"

    # Research settings
    INITIAL_SEARCH_RESULTS = 5    # Tavily results per query for the broad outline search
    DEEP_DIVE_SEARCH_RESULTS = 7  # Tavily results per query for per-section deep dives

    # RAG settings
    CHUNKS_TO_RETRIEVE = 30        # candidate pool size before re-ranking
    CHUNKS_TO_USE_FOR_WRITING = 10  # top re-ranked chunks handed to the writer

    # LLM settings (sampling temperatures per role)
    WRITER_TEMPERATURE = 0.4
    PLANNER_TEMPERATURE = 0.2
    NLU_TEMPERATURE = 0.1
|
research_agent/llm_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import google.generativeai as genai
|
| 3 |
+
from google.generativeai.types import GenerationConfig, HarmCategory, HarmBlockThreshold
|
| 4 |
+
|
| 5 |
+
def run_gemini_json_completion(model, prompt: str, temperature: float):
    """Call Gemini in JSON mode and return the parsed response as a dict.

    Degrades gracefully: any API or JSON-parsing failure is logged and an
    empty dict is returned instead of raising.
    """
    try:
        gen_config = GenerationConfig(response_mime_type="application/json", temperature=temperature)
        raw = model.generate_content(prompt, generation_config=gen_config)
        return json.loads(raw.text)
    except Exception as e:
        print(f"Warning: Failed to parse JSON from Gemini. Error: {e}")
        return {}
|
| 13 |
+
|
| 14 |
+
def run_gemini_text_completion(model, prompt: str, temperature: float):
    """Call Gemini for a plain-text completion; never raises.

    On failure the error is folded into the returned string (prefixed with
    "[Error: ...") so a single generation step cannot crash the pipeline.
    """
    gen_config = GenerationConfig(temperature=temperature)
    # Relax only the harassment filter; other safety categories keep API defaults.
    safety = [{"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_ONLY_HIGH}]
    try:
        reply = model.generate_content(prompt, generation_config=gen_config, safety_settings=safety)
        return reply.text
    except Exception as e:
        return f"[Error: Could not generate response. Details: {e}]"
|
research_agent/prompts.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# The system instruction now has a stronger mandate to use ONLY provided sources.
|
| 2 |
+
writer_system_instruction = """
|
| 3 |
+
You are a distinguished academic researcher. Your primary function is to synthesize information ONLY from the provided research materials.
|
| 4 |
+
You MUST ignore any of your own prior knowledge and base your writing exclusively on the text provided to you.
|
| 5 |
+
You are meticulous about citing your sources. When you make a factual claim, you MUST cite the source.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# The planner prompt is now more forceful about using the context.
# BUG FIX: this template is rendered with str.format(topic=..., context=...),
# so every literal brace in the JSON example must be doubled ({{ / }}) —
# single braces made .format() raise ValueError ("Single '{' encountered")
# at runtime. (initial_research_prompt below already escapes its braces.)
planner_prompt = """
Your task is to create a detailed report outline based on the provided research topic.
You MUST respond with ONLY a valid JSON object.
The JSON object must contain a key "sections", which is a list of objects.
Each object in the "sections" list MUST have two keys: "title" and "description".

Topic: '{topic}'
Context: {context}

Example of a perfect response:
{{
    "sections": [
        {{
            "title": "Introduction to Vertical Farming",
            "description": "A brief overview of the concept, its history, and its relevance in modern agriculture."
        }},
        {{
            "title": "Key Technologies and Methods",
            "description": "An exploration of the core technologies like hydroponics, aeroponics, and LED lighting that enable vertical farming."
        }}
    ]
}}
"""
|
| 32 |
+
|
| 33 |
+
# The writer prompt is now more forceful about citations and ignoring prior knowledge.
|
| 34 |
+
section_writer_prompt = """
|
| 35 |
+
Your task is to write a single, detailed, and analytical section for a research paper on the topic of '{topic}'.
|
| 36 |
+
The section you are writing is: '## {section_title}'
|
| 37 |
+
|
| 38 |
+
**CRITICAL INSTRUCTIONS:**
|
| 39 |
+
1. **USE ONLY PROVIDED SOURCES:** You MUST base your writing entirely on the "Research Material" provided below. Do not add any information from your own knowledge.
|
| 40 |
+
2. **CITE EVERYTHING:** Every factual statement you make must be followed by an in-text citation in the format `[Source X]`, where 'X' is the number of the source from the list. If a single sentence synthesizes from multiple sources, cite them all (e.g., `[Source 1][Source 3]`).
|
| 41 |
+
3. **SYNTHESIZE, DON'T SUMMARIZE:** Analyze and connect the information from different sources to build a comprehensive narrative.
|
| 42 |
+
4. **FORMAL TONE:** Maintain a formal, academic tone.
|
| 43 |
+
|
| 44 |
+
**Research Material (Sources are numbered):**
|
| 45 |
+
---
|
| 46 |
+
{research}
|
| 47 |
+
---
|
| 48 |
+
Now, write the complete, cited content for the '{section_title}' section, remembering to cite every fact.
|
| 49 |
+
"""
|
| 50 |
+
|
| 51 |
+
# The final section prompt is also made more forceful.
|
| 52 |
+
final_section_writer_prompt = """
|
| 53 |
+
Your task is to write the {section_title} for a research paper on '{topic}'.
|
| 54 |
+
You MUST ONLY use the provided "Main Body Content" to write this section. Do not introduce any new information.
|
| 55 |
+
|
| 56 |
+
- For an **Introduction**, set the stage by summarizing the key themes present in the provided body content.
|
| 57 |
+
- For a **Conclusion**, synthesize the findings from the body content and discuss their implications.
|
| 58 |
+
- **At the end of the conclusion text**, add a `### Bibliography` section and list every single URL from the provided `Source URLs for Bibliography`.
|
| 59 |
+
|
| 60 |
+
**Main Body Content of the Report:**
|
| 61 |
+
---
|
| 62 |
+
{body_content}
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
**Source URLs for Bibliography:**
|
| 66 |
+
---
|
| 67 |
+
{source_urls}
|
| 68 |
+
---
|
| 69 |
+
Now, write the complete content for the '{section_title}' section.
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
# The query writer can remain the same.
|
| 73 |
+
initial_research_prompt = """Generate 3 broad search queries for the topic: '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
|
| 74 |
+
query_writer_prompt = """Generate {num_queries} specific search queries for the report section titled '{section_title}' about '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
|
| 75 |
+
|
| 76 |
+
clarification_prompt_template = """
|
| 77 |
+
You are a research assistant. To provide the most relevant report on '{initial_topic}', generate 3-4 clarifying questions for the user.
|
| 78 |
+
These questions should help narrow down the scope, perspective, and focus of the research.
|
| 79 |
+
Present them as a simple, clear, numbered list.
|
| 80 |
+
"""
|
| 81 |
+
|
| 82 |
+
brief_constructor_prompt_template = """
|
| 83 |
+
Synthesize the following user request into a single, concise, and factual research topic string.
|
| 84 |
+
- User's Initial Topic: '{initial_topic}'
|
| 85 |
+
- User's Refinements: '{user_answers}'
|
| 86 |
+
RULES: Do NOT add any conversational preamble. The output MUST be a single, clean string suitable for a report title.
|
| 87 |
+
Example Output: A comprehensive analysis of Elon Musk's impact on space exploration and sustainable energy.
|
| 88 |
+
"""
|
| 89 |
+
|
| 90 |
+
expansion_prompt_template = "Given the report section '{section_title}: {section_description}', generate 3-5 specific sub-topics or key questions to investigate."
|
| 91 |
+
|
| 92 |
+
verification_prompt_template = """
|
| 93 |
+
Here is a draft of a report section and the source material it was based on.
|
| 94 |
+
Your task is to act as a fact-checker. Read the draft and verify three things:
|
| 95 |
+
1. Are there any factual claims in the draft that are NOT supported by the source material?
|
| 96 |
+
2. Are there any misinterpretations of the source material (e.g., confusing a company's sale price with an investment)?
|
| 97 |
+
3. Is the draft free of future-dated or clearly speculative dates presented as fact?
|
| 98 |
+
|
| 99 |
+
If all checks pass, respond with "OK".
|
| 100 |
+
If you find an error, respond with a corrected version of the specific sentence or paragraph.
|
| 101 |
+
|
| 102 |
+
**DRAFT TO VERIFY:**
|
| 103 |
+
---
|
| 104 |
+
{section_text}
|
| 105 |
+
---
|
| 106 |
+
|
| 107 |
+
**SOURCE MATERIAL:**
|
| 108 |
+
---
|
| 109 |
+
{research_context}
|
| 110 |
+
---
|
| 111 |
+
|
| 112 |
+
Verification Result:
|
| 113 |
+
"""
|
| 114 |
+
|
| 115 |
+
writer_prompt_template = """
|
| 116 |
+
{writer_system_instruction}
|
| 117 |
+
|
| 118 |
+
**Report So Far (for context and to avoid repetition):**
|
| 119 |
+
---
|
| 120 |
+
{previous_sections_context}
|
| 121 |
+
---
|
| 122 |
+
|
| 123 |
+
Now, using the following research material, write the next section of the report: '## {section_title}'.
|
| 124 |
+
CITE EVERY FACT using [Source X] format. Ensure your writing flows naturally from the 'Report So Far'.
|
| 125 |
+
|
| 126 |
+
**Research Material for this Section:**
|
| 127 |
+
---
|
| 128 |
+
{context_for_llm}
|
| 129 |
+
---
|
| 130 |
+
|
| 131 |
+
Section Content:
|
| 132 |
+
"""
|
research_agent/rag_pipeline.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
import numpy as np
|
| 3 |
+
import faiss
|
| 4 |
+
from rank_bm25 import BM25Okapi
|
| 5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 6 |
+
|
| 7 |
+
class RAGPipeline:
    """A pipeline for Retrieval-Augmented Generation.

    Hybrid retrieval: dense (FAISS over sentence-transformer embeddings)
    plus sparse (BM25), with a cross-encoder re-ranking the merged
    candidate pool. The index is rebuilt from scratch on each call to
    index_research (one index per report section).
    """
    def __init__(self, embedding_model, reranker):
        # ~1000-char chunks with 150-char overlap so facts spanning a chunk
        # boundary survive intact in at least one chunk.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        self.embedding_model = embedding_model
        self.reranker = reranker
        self.chunks_with_meta = []  # [{'content': str, 'source': url}], index-aligned with all_chunks
        self.faiss_index = None     # dense index; None until index_research runs
        self.bm25_index = None      # sparse index; None until index_research runs
        self.all_chunks = []        # chunk texts only, index-aligned with chunks_with_meta

    def index_research(self, research_items: List[dict]):
        """Create an index of research material for fast retrieval."""
        # Rebuild from scratch: any previous section's chunks are discarded.
        self.chunks_with_meta = []
        self.all_chunks = []
        for item in research_items:
            # Items carry 'content' and 'source' keys, as produced by
            # search.gather_research.
            chunks = self.text_splitter.split_text(item['content'])
            for chunk in chunks:
                self.chunks_with_meta.append({'content': chunk, 'source': item['source']})
                self.all_chunks.append(chunk)

        if not self.all_chunks:
            print("Warning: No chunks to index.")
            return

        print(f"--> Embedding {len(self.all_chunks)} chunks...")
        embeddings = self.embedding_model.encode(self.all_chunks, convert_to_tensor=False)
        # Exact (flat) L2 index — fine at per-section corpus sizes.
        self.faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
        self.faiss_index.add(np.array(embeddings, dtype=np.float32))

        # Naive whitespace tokenization for BM25 — no lowercasing/stemming.
        tokenized_corpus = [doc.split(" ") for doc in self.all_chunks]
        self.bm25_index = BM25Okapi(tokenized_corpus)

    def retrieve_and_rerank(self, query: str, top_k: int = 10):
        """Retrieve relevant chunks and rerank them for the final context."""
        # Guard: nothing indexed yet (or index_research saw empty input).
        if not self.chunks_with_meta or self.faiss_index is None or self.bm25_index is None:
            return []

        print(f"--> Retrieving and re-ranking for query: '{query[:50]}...'")

        # Dense candidates: 2*top_k nearest neighbours (capped at corpus size).
        query_embedding = self.embedding_model.encode([query], convert_to_tensor=False)
        distances, faiss_indices = self.faiss_index.search(np.array(query_embedding, dtype=np.float32), k=min(top_k * 2, len(self.all_chunks)))

        # Sparse candidates: top 2*top_k chunks by BM25 score.
        tokenized_query = query.split(" ")
        bm25_scores = self.bm25_index.get_scores(tokenized_query)
        bm25_indices = np.argsort(bm25_scores)[::-1][:min(top_k * 2, len(self.all_chunks))]

        # Union of both candidate pools; duplicates collapse in the set.
        combined_indices = set(faiss_indices[0]).union(set(bm25_indices))

        rerank_pairs = [[query, self.chunks_with_meta[idx]['content']] for idx in combined_indices]

        if not rerank_pairs:
            return []

        scores = self.reranker.predict(rerank_pairs)

        # NOTE: relies on both iterations over combined_indices yielding the
        # same order, which holds because the set is not mutated in between.
        scored_items = sorted(zip(scores, combined_indices), key=lambda x: x[0], reverse=True)

        final_results = [self.chunks_with_meta[idx] for score, idx in scored_items[:top_k]]

        return final_results
|
research_agent/search.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from tavily import TavilyClient
|
| 3 |
+
|
| 4 |
+
def gather_research(tavily_client: TavilyClient, queries: List[str], num_results: int) -> List[dict]:
    """Run every query through Tavily and collect sourced snippets.

    Each returned dict has 'content' (snippet text) and 'source' (URL).
    Failed queries are logged and skipped, so the result may contain fewer
    entries than expected — possibly none.
    """
    collected = []
    print(f"-> Gathering research for {len(queries)} queries (max {num_results} results each)...")
    for query in queries:
        try:
            response = tavily_client.search(query=query, search_depth="advanced", max_results=num_results, include_raw_content=True)
            for hit in response['results']:
                # Keep only results that actually carry text and a URL.
                if hit.get('content') and hit.get('url'):
                    collected.append({"content": hit['content'], "source": hit['url']})
        except Exception as e:
            # Best-effort: a single failed query must not abort the batch.
            print(f"Tavily search failed for '{query}': {e}")
    return collected
|