MusaR commited on
Commit
20d3dd7
·
verified ·
1 Parent(s): 0e74258

Upload 9 files

Browse files
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import google.generativeai as genai
4
+ from tavily import TavilyClient
5
+ from sentence_transformers import SentenceTransformer, CrossEncoder
6
+
7
+ from research_agent.config import AgentConfig
8
+ from research_agent.agent import get_clarifying_questions, research_and_plan, write_report_stream
9
+
10
# --- CSS for styling the Gradio app ---
# .status_box / .report_output are attached to Markdown components below via
# elem_classes; the rest targets Gradio's default class names.
CSS = """
body { font-family: 'Inter', sans-serif; background-color: #F0F2F6; }
.gradio-container { max-width: 960px !important; margin: auto !important; }
h1 { text-align: center; font-size: 2.5em; color: #1E3A8A; }
.gr-button { background-color: #2563EB; color: white; }
.gr-button:hover { background-color: #1E4ED8; }
.status_box {
    background-color: #FFFFFF;
    border-radius: 8px;
    padding: 15px;
    border: 1px solid #E5E7EB;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.report_output {
    background-color: #FFFFFF;
    border-radius: 8px;
    padding: 20px;
    border: 1px solid #E5E7EB;
    box-shadow: 0 4px 8px rgba(0,0,0,0.05);
}
"""

# --- Global variables for models (to avoid reloading) ---
# All model handles start as None and are populated lazily (and at most once)
# by initialize_models(); tavily_client is re-created on every call.
writer_model = None
planner_model = None
embedding_model = None
reranker = None
tavily_client = None
config = AgentConfig()
40
+
41
def initialize_models(google_api_key, tavily_api_key):
    """Configure the API clients and lazily load the heavyweight models.

    Each model global is created only on the first call; subsequent calls
    reuse the cached instances. Raises gr.Error (surfaced in the UI) when a
    key is missing or initialization fails.
    """
    global writer_model, planner_model, embedding_model, reranker, tavily_client

    if not google_api_key or not tavily_api_key:
        raise gr.Error("API keys are required. Please provide both Google and Tavily API keys.")

    try:
        genai.configure(api_key=google_api_key)
        tavily_client = TavilyClient(api_key=tavily_api_key)

        # Lazy one-time construction: only build what is still missing.
        if writer_model is None:
            writer_model = genai.GenerativeModel(config.WRITER_MODEL)
        if planner_model is None:
            planner_model = genai.GenerativeModel(config.WRITER_MODEL)
        if embedding_model is None:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        if reranker is None:
            reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')
    except Exception as e:
        raise gr.Error(f"Failed to initialize models. Please check your API keys. Error: {str(e)}")

    return "Models initialized successfully!"
64
+
65
def start_research_phase(topic, google_key, tavily_key):
    """Phase 1: validate the topic and surface the clarifying-questions UI.

    Returns a dict keyed by Gradio components that hides the initial form,
    reveals the clarification stage, and fills in the generated questions.
    """
    initialize_models(google_key, tavily_key)

    if not topic:
        raise gr.Error("Research topic cannot be empty.")

    questions = get_clarifying_questions(planner_model, topic)

    # Swap the initial form out for the clarification stage.
    return {
        initial_ui: gr.update(visible=False),
        clarification_ui: gr.update(visible=True),
        clarification_questions_display: gr.update(value=questions),
    }
80
+
81
def generate_report_phase(topic, answers, google_key, tavily_key):
    """Phase 2: Take answers and generate the full report, streaming progress.

    Streams status lines into `status_box` while the agent plans, searches,
    and writes, then publishes the finished report markdown to `final_report`.

    Args:
        topic: Research topic saved from phase 1 (topic_state).
        answers: The user's free-text answers to the clarifying questions.
        google_key, tavily_key: API keys used to (re)initialize clients.

    Yields:
        Dicts mapping Gradio components to gr.update payloads.
    """
    initialize_models(google_key, tavily_key)

    status_updates = "### Agent Status\n"
    yield {
        status_box: gr.update(value=status_updates + "-> Planning research...\n"),
        final_report: gr.update(value=None)
    }

    try:
        plan = research_and_plan(config, planner_model, tavily_client, topic, answers)
    except Exception as e:
        raise gr.Error(f"Failed during planning phase: {e}")

    status_updates += f"**Research Plan:**\n- **Topic:** {plan['detailed_topic']}\n- **Sections:** {[s.title for s in plan['sections']]}\n\n---\n"
    yield { status_box: gr.update(value=status_updates) }

    report_generator = write_report_stream(config, writer_model, tavily_client, embedding_model, reranker, plan)

    # BUG FIX: write_report_stream yields *status strings* and returns the
    # completed report text via `return` (i.e. StopIteration.value). The old
    # loop overwrote final_report_md with every yielded status line, so the
    # report pane only ever showed the last progress message. Drive the
    # generator manually so the return value can be captured.
    final_report_md = ""
    while True:
        try:
            update = next(report_generator)
        except StopIteration as stop:
            if isinstance(stop.value, str):
                final_report_md = stop.value
            break
        if isinstance(update, str):
            status_updates += update
            yield { status_box: gr.update(value=status_updates) }

    yield { final_report: gr.update(value=final_report_md) }
109
+
110
# --- Build the Gradio Interface ---
# Three stages: (1) keys + topic form, (2) clarifying questions, (3) streamed
# status plus final report. Stage visibility is toggled by the handlers above.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as app:

    gr.Markdown("# Mini DeepSearch Agent")
    gr.Markdown("This agent performs in-depth research on a given topic, using AI to plan, search, and write a comprehensive report.")

    # State to hold the original topic across the two click handlers.
    topic_state = gr.State()

    # --- UI Stage 1: Initial Query ---
    # NOTE(review): gr.Box was removed in Gradio 4.x (gr.Group replaces it) —
    # confirm which Gradio version this app is pinned to.
    with gr.Box(visible=True) as initial_ui:
        with gr.Row():
            google_api_key_input = gr.Textbox(label="Google API Key", type="password", placeholder="Enter your Google AI API Key")
            tavily_api_key_input = gr.Textbox(label="Tavily API Key", type="password", placeholder="Enter your Tavily Search API Key")

        topic_input = gr.Textbox(label="Research Topic", placeholder="e.g., The future of renewable energy")
        start_button = gr.Button("Start Research", variant="primary")

    # --- UI Stage 2: Clarification (hidden until stage 1 completes) ---
    with gr.Box(visible=False) as clarification_ui:
        gr.Markdown("### To give you the most relevant report, could you please clarify:")
        clarification_questions_display = gr.Markdown(elem_classes="status_box")
        clarification_answers_input = gr.Textbox(label="Your Answers", placeholder="Provide your answers to the questions above to tailor the research...")
        generate_report_button = gr.Button("Generate Full Report", variant="primary")

    # --- UI Stage 3: Output (revealed by show_outputs just before streaming) ---
    with gr.Column():
        status_box = gr.Markdown(elem_classes="status_box", label="Agent Thought Process", visible=False)
        final_report = gr.Markdown(elem_classes="report_output", label="Final Research Report", visible=False)

    # --- Event Handlers ---
    def show_outputs():
        """Reveal the status/report panes before report generation starts."""
        return {
            status_box: gr.update(visible=True),
            final_report: gr.update(visible=True)
        }

    # start_research_phase returns a dict keyed by component, so the order of
    # this outputs list does not need to match the returned dict.
    start_button.click(
        fn=start_research_phase,
        inputs=[topic_input, google_api_key_input, tavily_api_key_input],
        outputs=[initial_ui, clarification_ui, clarification_questions_display]
    ).then(
        fn=lambda topic: topic,
        inputs=[topic_input],
        outputs=[topic_state]  # Save the topic for the next step
    )

    generate_report_button.click(
        fn=show_outputs,
        outputs=[status_box, final_report]
    ).then(
        fn=generate_report_phase,
        inputs=[topic_state, clarification_answers_input, google_api_key_input, tavily_api_key_input],
        outputs=[status_box, final_report]
    )

app.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ google-generativeai
2
+ tavily-python
3
+ pydantic
4
+ langchain
5
+ sentence-transformers
6
+ faiss-cpu
7
+ rank_bm25
8
+ transformers
9
+ torch
10
+ ipython
11
+ gradio
12
+ kaggle_secrets
13
+ nest_asyncio
research_agent/__init__.py ADDED
File without changes
research_agent/agent.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pydantic import BaseModel, Field
4
+ from IPython.display import display, Markdown
5
+
6
+ # Local module imports
7
+ from .config import AgentConfig
8
+ from . import prompts
9
+ from .llm_utils import run_gemini_json_completion, run_gemini_text_completion
10
+ from .search import gather_research
11
+ from .rag_pipeline import RAGPipeline
12
+
13
+ # For running async code in notebook
14
+ import nest_asyncio
15
+ nest_asyncio.apply()
16
+
17
class Section(BaseModel):
    """One planned section of the report: a title plus what it should cover."""
    title: str = Field(description="The title of the report section.")
    description: str = Field(description="A detailed description of what the section will cover, including key sub-topics.")
20
+
21
def run_verification_step(writer_model, section_text: str, research_context: str):
    """Fact-check a drafted section against the research it was written from.

    Asks the model to verify the draft (via verification_prompt_template,
    which instructs it to answer "OK" on success, or a correction otherwise).

    Args:
        writer_model: Gemini model used for the verification call.
        section_text: The drafted section markdown to verify.
        research_context: The numbered source material the draft was based on.

    Returns:
        The original section_text if verification passes, otherwise the draft
        with an appended self-correction note containing the model's fix.
    """
    verification_prompt = prompts.verification_prompt_template.format(
        section_text=section_text,
        research_context=research_context
    )

    # Temperature 0.0: deterministic, conservative fact-checking.
    verification_result = run_gemini_text_completion(writer_model, verification_prompt, 0.0)

    # BUG FIX: the old check was `"OK" in verification_result.upper()`, which
    # passed any response merely *containing* the letters "OK" (e.g. "LOOKS
    # WRONG", or a correction quoting the word "broker"). The prompt asks for
    # exactly "OK" on success, so accept only a response that starts with it.
    if verification_result.strip().upper().startswith("OK"):
        return section_text
    else:
        return f"{section_text}\n\n---\n*Self-Correction Note: An issue was found during verification. The model suggested the following correction: {verification_result}*"
34
+
35
def get_clarifying_questions(planner_model, initial_topic: str):
    """Ask the planner LLM for a short numbered list of clarifying questions."""
    clarification_prompt = prompts.clarification_prompt_template.format(
        initial_topic=initial_topic
    )
    return run_gemini_text_completion(planner_model, clarification_prompt, 0.5)
40
+
41
def research_and_plan(config: AgentConfig, planner_model, tavily_client, initial_topic: str, user_answers: str):
    """Constructs the research brief and generates the report outline.

    Pipeline: (1) fuse the user's topic and clarification answers into one
    detailed topic string, (2) run a broad Tavily search to give the planner
    context, (3) ask the planner LLM for a JSON outline, (4) expand each
    section with LLM-generated sub-topics.

    Returns:
        dict with keys "detailed_topic" (str) and "sections" (list[Section]).

    Raises:
        ValueError: if the planner response cannot be turned into Section
            objects, or yields no sections at all.
    """
    print("\n--- Step 1: Constructing Detailed Research Brief ---")
    brief_constructor_prompt = prompts.brief_constructor_prompt_template.format(
        initial_topic=initial_topic,
        user_answers=user_answers
    )
    detailed_topic = run_gemini_text_completion(planner_model, brief_constructor_prompt, config.PLANNER_TEMPERATURE).strip()

    print(f"\n--- Step 2: Performing Broad Initial Research for Outline ---")
    initial_research = gather_research(tavily_client, [detailed_topic], config.INITIAL_SEARCH_RESULTS)
    planning_context = "\n\n".join(item['content'] for item in initial_research)

    # Context is truncated to 20k chars to keep the planner prompt bounded.
    planner_prompt = prompts.planner_prompt.format(topic=detailed_topic, context=planning_context[:20000])
    plan_response = run_gemini_json_completion(planner_model, planner_prompt, config.PLANNER_TEMPERATURE)

    # run_gemini_json_completion returns {} on failure, so .get() below also
    # covers the no-response case (caught by the "no sections" check).
    try:
        initial_sections = [Section(**s) for s in plan_response.get("sections", [])]
    except Exception as e:
        raise ValueError(f"Could not create a valid report plan. Error: {e}")
    if not initial_sections:
        raise ValueError("Planner returned no sections.")

    print("\n--- Step 3: Expanding Outline for Deep Research ---")
    expanded_sections = []
    for section in initial_sections:
        expansion_prompt = prompts.expansion_prompt_template.format(section_title=section.title, section_description=section.description)
        sub_topics_text = run_gemini_text_completion(planner_model, expansion_prompt, 0.6)
        # The appended sub-topics later become extra search queries:
        # write_report_stream takes the last lines of each description.
        section.description += "\n\nKey areas to investigate:\n" + sub_topics_text
        expanded_sections.append(section)

    return {"detailed_topic": detailed_topic, "sections": expanded_sections}
73
+
74
def write_report_stream(config: AgentConfig, writer_model, tavily_client, embedding_model, reranker, plan: dict):
    """Writes the report section by section, yielding progress updates.

    Yields human-readable status strings as it works. The completed report
    markdown is the generator's *return value* (StopIteration.value), so
    callers must capture it explicitly rather than using the last yield.

    Args:
        config: AgentConfig with search/RAG/LLM settings.
        writer_model: Gemini model used for drafting and verification.
        tavily_client: Tavily search client.
        embedding_model, reranker: models backing the RAGPipeline.
        plan: dict with "detailed_topic" (str) and "sections" (list[Section]),
            as produced by research_and_plan().
    """

    detailed_topic = plan["detailed_topic"]
    sections = plan["sections"]

    yield f"### Starting Report Generation for: {detailed_topic}\n\n"

    # Accumulates the growing report text and the union of all cited URLs.
    report_state = {"full_report_text": f"# Deep Research Report: {detailed_topic}\n\n", "all_source_urls": set()}
    rag_pipeline = RAGPipeline(embedding_model, reranker)

    for i, section in enumerate(sections):
        yield f"--- \n### Processing Section {i+1}/{len(sections)}: {section.title}...\n"

        # Prior sections are given to the writer for continuity and to avoid
        # repetition across sections.
        previous_sections_context = report_state["full_report_text"]

        # Search queries: the section title plus the last 3 description lines
        # (the "Key areas to investigate" appended by research_and_plan),
        # each trimmed and capped at 400 chars.
        section_queries = [f"{detailed_topic} - {section.title}"] + section.description.split('\n')[-3:]
        section_queries = [q.strip() for q in section_queries if q.strip()]
        section_queries = [q[:400] for q in section_queries]

        yield f"-> Searching the web for: `{'`, `'.join(section_queries)}`\n"
        section_research = gather_research(tavily_client, section_queries, config.DEEP_DIVE_SEARCH_RESULTS)

        if not section_research:
            # No sources found: emit a placeholder section instead of aborting.
            section_content = f"## {section.title}\n\nNo research material could be gathered for this section.\n\n"
            report_state["full_report_text"] += section_content
            continue

        yield f"-> Found {len(section_research)} sources. Indexing for RAG...\n"
        rag_pipeline.index_research(section_research)
        top_chunks_with_meta = rag_pipeline.retrieve_and_rerank(section.description, top_k=config.CHUNKS_TO_USE_FOR_WRITING)

        # Number each unique source URL and build the "Source [N]: ..." context
        # block the writer prompt cites against.
        context_for_llm = ""
        cited_sources_for_section = {}
        citation_counter = 1
        for item in top_chunks_with_meta:
            source_url = item['source']
            report_state["all_source_urls"].add(source_url)
            if source_url not in cited_sources_for_section:
                cited_sources_for_section[source_url] = citation_counter
                citation_counter += 1
            citation_num = cited_sources_for_section[source_url]
            context_for_llm += f"Source [{citation_num}]: {item['content']}\n\n"

        bibliography = "\n".join(f"[{num}] {url}" for url, num in cited_sources_for_section.items())

        yield f"-> Synthesizing and writing section content...\n"
        writer_prompt = prompts.writer_prompt_template.format(
            writer_system_instruction=prompts.writer_system_instruction,
            previous_sections_context=previous_sections_context,
            section_title=section.title,
            context_for_llm=context_for_llm
        )

        draft_content = run_gemini_text_completion(writer_model, writer_prompt, config.WRITER_TEMPERATURE)

        yield "-> Fact-checking and verifying section...\n"
        final_content = run_verification_step(writer_model, draft_content, context_for_llm)
        final_content_with_sources = f"## {section.title}\n\n{final_content}\n\n**Sources Used in this Section**\n{bibliography}\n\n"

        report_state["full_report_text"] += final_content_with_sources

    # Sorted master bibliography across every section's sources.
    final_bibliography = "\n".join(f"- {url}" for url in sorted(list(report_state["all_source_urls"])))
    report_state["full_report_text"] += f"## Master Bibliography\n\n{final_bibliography}"

    yield "\n--- Report Generation Complete ---\n"
    return report_state["full_report_text"]
research_agent/config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
class AgentConfig:
    """Configuration settings for the Max-Depth agent."""
    # Gemini model name; app.py uses this same constant for both the writer
    # and the planner model instances.
    WRITER_MODEL = "gemini-1.5-flash-latest"

    # Research settings
    INITIAL_SEARCH_RESULTS = 5     # Tavily results for the broad outline search
    DEEP_DIVE_SEARCH_RESULTS = 7   # Tavily results per section deep-dive query

    # RAG settings
    # NOTE(review): CHUNKS_TO_RETRIEVE is not referenced by the visible
    # modules (retrieve_and_rerank over-fetches top_k * 2 itself) — confirm.
    CHUNKS_TO_RETRIEVE = 30
    CHUNKS_TO_USE_FOR_WRITING = 10  # top reranked chunks fed to the writer

    # LLM settings (sampling temperatures)
    WRITER_TEMPERATURE = 0.4
    PLANNER_TEMPERATURE = 0.2
    NLU_TEMPERATURE = 0.1
research_agent/llm_utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import google.generativeai as genai
3
+ from google.generativeai.types import GenerationConfig, HarmCategory, HarmBlockThreshold
4
+
5
def run_gemini_json_completion(model, prompt: str, temperature: float):
    """Call Gemini in JSON mode and return the parsed object.

    Returns {} (after printing a warning) if the call or parsing fails for
    any reason, so callers can treat the result as a plain dict.
    """
    try:
        generation_config = GenerationConfig(
            response_mime_type="application/json",
            temperature=temperature,
        )
        response = model.generate_content(prompt, generation_config=generation_config)
        return json.loads(response.text)
    except Exception as e:
        print(f"Warning: Failed to parse JSON from Gemini. Error: {e}")
        return {}
13
+
14
def run_gemini_text_completion(model, prompt: str, temperature: float):
    """Call Gemini for plain text at the given temperature.

    On any failure, returns an inline "[Error: ...]" marker string instead of
    raising, so the pipeline keeps going.
    """
    gen_config = GenerationConfig(temperature=temperature)
    safety = [{"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_ONLY_HIGH}]
    try:
        return model.generate_content(
            prompt,
            generation_config=gen_config,
            safety_settings=safety,
        ).text
    except Exception as e:
        return f"[Error: Could not generate response. Details: {e}]"
research_agent/prompts.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# The system instruction now has a stronger mandate to use ONLY provided sources.
# Injected into writer_prompt_template via its {writer_system_instruction} slot.
writer_system_instruction = """
You are a distinguished academic researcher. Your primary function is to synthesize information ONLY from the provided research materials.
You MUST ignore any of your own prior knowledge and base your writing exclusively on the text provided to you.
You are meticulous about citing your sources. When you make a factual claim, you MUST cite the source.
"""
7
+
8
# The planner prompt is now more forceful about using the context.
# BUG FIX: this template is rendered with str.format(topic=..., context=...)
# (see research_and_plan), so every literal brace in the JSON example must be
# doubled ({{ / }}) — otherwise .format() raises KeyError/ValueError at
# runtime. The sibling query prompts below already escape theirs.
planner_prompt = """
Your task is to create a detailed report outline based on the provided research topic.
You MUST respond with ONLY a valid JSON object.
The JSON object must contain a key "sections", which is a list of objects.
Each object in the "sections" list MUST have two keys: "title" and "description".

Topic: '{topic}'
Context: {context}

Example of a perfect response:
{{
    "sections": [
        {{
            "title": "Introduction to Vertical Farming",
            "description": "A brief overview of the concept, its history, and its relevance in modern agriculture."
        }},
        {{
            "title": "Key Technologies and Methods",
            "description": "An exploration of the core technologies like hydroponics, aeroponics, and LED lighting that enable vertical farming."
        }}
    ]
}}
"""
32
+
33
# The writer prompt is now more forceful about citations and ignoring prior knowledge.
# Placeholders: {topic}, {section_title}, {research}.
section_writer_prompt = """
Your task is to write a single, detailed, and analytical section for a research paper on the topic of '{topic}'.
The section you are writing is: '## {section_title}'

**CRITICAL INSTRUCTIONS:**
1. **USE ONLY PROVIDED SOURCES:** You MUST base your writing entirely on the "Research Material" provided below. Do not add any information from your own knowledge.
2. **CITE EVERYTHING:** Every factual statement you make must be followed by an in-text citation in the format `[Source X]`, where 'X' is the number of the source from the list. If a single sentence synthesizes from multiple sources, cite them all (e.g., `[Source 1][Source 3]`).
3. **SYNTHESIZE, DON'T SUMMARIZE:** Analyze and connect the information from different sources to build a comprehensive narrative.
4. **FORMAL TONE:** Maintain a formal, academic tone.

**Research Material (Sources are numbered):**
---
{research}
---
Now, write the complete, cited content for the '{section_title}' section, remembering to cite every fact.
"""

# The final section prompt is also made more forceful.
# Placeholders: {section_title}, {topic}, {body_content}, {source_urls}.
final_section_writer_prompt = """
Your task is to write the {section_title} for a research paper on '{topic}'.
You MUST ONLY use the provided "Main Body Content" to write this section. Do not introduce any new information.

- For an **Introduction**, set the stage by summarizing the key themes present in the provided body content.
- For a **Conclusion**, synthesize the findings from the body content and discuss their implications.
- **At the end of the conclusion text**, add a `### Bibliography` section and list every single URL from the provided `Source URLs for Bibliography`.

**Main Body Content of the Report:**
---
{body_content}
---

**Source URLs for Bibliography:**
---
{source_urls}
---
Now, write the complete content for the '{section_title}' section.
"""

# The query writer can remain the same.
# Both templates correctly double their literal JSON braces ({{ }}) because
# they are rendered with str.format().
initial_research_prompt = """Generate 3 broad search queries for the topic: '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
query_writer_prompt = """Generate {num_queries} specific search queries for the report section titled '{section_title}' about '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
75
+
76
# Rendered with .format(initial_topic=...) by agent.get_clarifying_questions.
clarification_prompt_template = """
You are a research assistant. To provide the most relevant report on '{initial_topic}', generate 3-4 clarifying questions for the user.
These questions should help narrow down the scope, perspective, and focus of the research.
Present them as a simple, clear, numbered list.
"""

# Rendered with .format(initial_topic=..., user_answers=...). Its output
# becomes the "detailed topic" used both as search query and report title.
brief_constructor_prompt_template = """
Synthesize the following user request into a single, concise, and factual research topic string.
- User's Initial Topic: '{initial_topic}'
- User's Refinements: '{user_answers}'
RULES: Do NOT add any conversational preamble. The output MUST be a single, clean string suitable for a report title.
Example Output: A comprehensive analysis of Elon Musk's impact on space exploration and sustainable energy.
"""

# Rendered with .format(section_title=..., section_description=...).
expansion_prompt_template = "Given the report section '{section_title}: {section_description}', generate 3-5 specific sub-topics or key questions to investigate."

# Fact-checking prompt: the agent treats a response of "OK" as a pass and
# anything else as a suggested correction (see agent.run_verification_step).
verification_prompt_template = """
Here is a draft of a report section and the source material it was based on.
Your task is to act as a fact-checker. Read the draft and verify three things:
1. Are there any factual claims in the draft that are NOT supported by the source material?
2. Are there any misinterpretations of the source material (e.g., confusing a company's sale price with an investment)?
3. Is the draft free of future-dated or clearly speculative dates presented as fact?

If all checks pass, respond with "OK".
If you find an error, respond with a corrected version of the specific sentence or paragraph.

**DRAFT TO VERIFY:**
---
{section_text}
---

**SOURCE MATERIAL:**
---
{research_context}
---

Verification Result:
"""

# Main section-writing prompt used by agent.write_report_stream. Placeholders:
# {writer_system_instruction}, {previous_sections_context}, {section_title},
# {context_for_llm}.
writer_prompt_template = """
{writer_system_instruction}

**Report So Far (for context and to avoid repetition):**
---
{previous_sections_context}
---

Now, using the following research material, write the next section of the report: '## {section_title}'.
CITE EVERY FACT using [Source X] format. Ensure your writing flows naturally from the 'Report So Far'.

**Research Material for this Section:**
---
{context_for_llm}
---

Section Content:
"""
research_agent/rag_pipeline.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ import faiss
4
+ from rank_bm25 import BM25Okapi
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
class RAGPipeline:
    """A pipeline for Retrieval-Augmented Generation.

    Hybrid retrieval: dense (FAISS over sentence-transformer embeddings) plus
    sparse (BM25), with a cross-encoder rerank of the merged candidates.
    Call index_research() before retrieve_and_rerank(); re-indexing replaces
    any previously indexed material.
    """
    def __init__(self, embedding_model, reranker):
        # 1000-char chunks with 150-char overlap so facts spanning a chunk
        # boundary remain retrievable.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        self.embedding_model = embedding_model
        self.reranker = reranker
        self.chunks_with_meta = []  # [{'content': chunk, 'source': url}, ...]
        self.faiss_index = None     # dense index, built by index_research()
        self.bm25_index = None      # sparse index, built by index_research()
        self.all_chunks = []        # chunk texts, parallel to chunks_with_meta

    def index_research(self, research_items: List[dict]):
        """Create an index of research material for fast retrieval.

        Args:
            research_items: dicts with 'content' and 'source' keys (as
                produced by search.gather_research). Replaces prior indexes.
        """
        self.chunks_with_meta = []
        self.all_chunks = []
        for item in research_items:
            chunks = self.text_splitter.split_text(item['content'])
            for chunk in chunks:
                self.chunks_with_meta.append({'content': chunk, 'source': item['source']})
                self.all_chunks.append(chunk)

        if not self.all_chunks:
            print("Warning: No chunks to index.")
            return

        print(f"--> Embedding {len(self.all_chunks)} chunks...")
        embeddings = self.embedding_model.encode(self.all_chunks, convert_to_tensor=False)
        # Flat L2 index: exact (non-approximate) nearest-neighbor search.
        self.faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
        self.faiss_index.add(np.array(embeddings, dtype=np.float32))

        # Naive whitespace tokenization for the BM25 corpus.
        tokenized_corpus = [doc.split(" ") for doc in self.all_chunks]
        self.bm25_index = BM25Okapi(tokenized_corpus)

    def retrieve_and_rerank(self, query: str, top_k: int = 10):
        """Retrieve relevant chunks and rerank them for the final context.

        Returns up to top_k chunk dicts ({'content', 'source'}), best first;
        [] if nothing has been indexed yet.
        """
        if not self.chunks_with_meta or self.faiss_index is None or self.bm25_index is None:
            return []

        print(f"--> Retrieving and re-ranking for query: '{query[:50]}...'")

        # Over-fetch 2x top_k from each retriever; the reranker picks the best.
        query_embedding = self.embedding_model.encode([query], convert_to_tensor=False)
        distances, faiss_indices = self.faiss_index.search(np.array(query_embedding, dtype=np.float32), k=min(top_k * 2, len(self.all_chunks)))

        tokenized_query = query.split(" ")
        bm25_scores = self.bm25_index.get_scores(tokenized_query)
        bm25_indices = np.argsort(bm25_scores)[::-1][:min(top_k * 2, len(self.all_chunks))]

        # Union of dense and sparse candidate indices (de-duplicated).
        combined_indices = set(faiss_indices[0]).union(set(bm25_indices))

        # NOTE: both this list and the zip() below iterate the same unmodified
        # set object, so the score <-> index correspondence holds in-call.
        rerank_pairs = [[query, self.chunks_with_meta[idx]['content']] for idx in combined_indices]

        if not rerank_pairs:
            return []

        scores = self.reranker.predict(rerank_pairs)

        # Highest cross-encoder score first.
        scored_items = sorted(zip(scores, combined_indices), key=lambda x: x[0], reverse=True)

        final_results = [self.chunks_with_meta[idx] for score, idx in scored_items[:top_k]]

        return final_results
research_agent/search.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from tavily import TavilyClient
3
+
4
def gather_research(tavily_client: TavilyClient, queries: List[str], num_results: int) -> List[dict]:
    """Run each query through Tavily and collect content/source-url records.

    Failed queries are logged and skipped; everything else is flattened into
    a single list of {"content": ..., "source": ...} dicts.
    """
    research_with_sources: List[dict] = []
    print(f"-> Gathering research for {len(queries)} queries (max {num_results} results each)...")
    for query in queries:
        try:
            response = tavily_client.search(
                query=query,
                search_depth="advanced",
                max_results=num_results,
                include_raw_content=True,
            )
            for hit in response['results']:
                content = hit.get('content')
                url = hit.get('url')
                # Keep only results that carry both a body and a source URL.
                if content and url:
                    research_with_sources.append({"content": content, "source": url})
        except Exception as e:
            print(f"Tavily search failed for '{query}': {e}")
    return research_with_sources