MusaR commited on
Commit
20d3dd7
·
verified ·
1 Parent(s): 0e74258

Upload 9 files

Browse files
app.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import google.generativeai as genai
4
+ from tavily import TavilyClient
5
+ from sentence_transformers import SentenceTransformer, CrossEncoder
6
+
7
+ from research_agent.config import AgentConfig
8
+ from research_agent.agent import get_clarifying_questions, research_and_plan, write_report_stream
9
+
10
# --- CSS for styling the Gradio app ---
# .status_box / .report_output are attached to Markdown components below via
# elem_classes; the rest targets Gradio's default class names.
CSS = """
body { font-family: 'Inter', sans-serif; background-color: #F0F2F6; }
.gradio-container { max-width: 960px !important; margin: auto !important; }
h1 { text-align: center; font-size: 2.5em; color: #1E3A8A; }
.gr-button { background-color: #2563EB; color: white; }
.gr-button:hover { background-color: #1E4ED8; }
.status_box {
    background-color: #FFFFFF;
    border-radius: 8px;
    padding: 15px;
    border: 1px solid #E5E7EB;
    box-shadow: 0 2px 4px rgba(0,0,0,0.05);
}
.report_output {
    background-color: #FFFFFF;
    border-radius: 8px;
    padding: 20px;
    border: 1px solid #E5E7EB;
    box-shadow: 0 4px 8px rgba(0,0,0,0.05);
}
"""

# --- Global variables for models (to avoid reloading) ---
# All model handles start as None and are populated lazily (and at most once)
# by initialize_models(); tavily_client is re-created on every call.
writer_model = None
planner_model = None
embedding_model = None
reranker = None
tavily_client = None
config = AgentConfig()
40
+
41
def initialize_models(google_api_key, tavily_api_key):
    """Configure the API clients and lazily load the heavyweight models.

    Each model global is created only on the first call; subsequent calls
    reuse the cached instances. Raises gr.Error (surfaced in the UI) when a
    key is missing or initialization fails.
    """
    global writer_model, planner_model, embedding_model, reranker, tavily_client

    if not google_api_key or not tavily_api_key:
        raise gr.Error("API keys are required. Please provide both Google and Tavily API keys.")

    try:
        genai.configure(api_key=google_api_key)
        tavily_client = TavilyClient(api_key=tavily_api_key)

        # Lazy one-time construction: only build what is still missing.
        if writer_model is None:
            writer_model = genai.GenerativeModel(config.WRITER_MODEL)
        if planner_model is None:
            planner_model = genai.GenerativeModel(config.WRITER_MODEL)
        if embedding_model is None:
            embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        if reranker is None:
            reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device='cpu')
    except Exception as e:
        raise gr.Error(f"Failed to initialize models. Please check your API keys. Error: {str(e)}")

    return "Models initialized successfully!"
64
+
65
def start_research_phase(topic, google_key, tavily_key):
    """Phase 1: validate the topic and surface the clarifying-questions UI.

    Returns a dict keyed by Gradio components that hides the initial form,
    reveals the clarification stage, and fills in the generated questions.
    """
    initialize_models(google_key, tavily_key)

    if not topic:
        raise gr.Error("Research topic cannot be empty.")

    questions = get_clarifying_questions(planner_model, topic)

    # Swap the initial form out for the clarification stage.
    return {
        initial_ui: gr.update(visible=False),
        clarification_ui: gr.update(visible=True),
        clarification_questions_display: gr.update(value=questions),
    }
80
+
81
def generate_report_phase(topic, answers, google_key, tavily_key):
    """Phase 2: Take answers and generate the full report, streaming progress.

    Streams status lines into `status_box` while the agent plans, searches,
    and writes, then publishes the finished report markdown to `final_report`.

    Args:
        topic: Research topic saved from phase 1 (topic_state).
        answers: The user's free-text answers to the clarifying questions.
        google_key, tavily_key: API keys used to (re)initialize clients.

    Yields:
        Dicts mapping Gradio components to gr.update payloads.
    """
    initialize_models(google_key, tavily_key)

    status_updates = "### Agent Status\n"
    yield {
        status_box: gr.update(value=status_updates + "-> Planning research...\n"),
        final_report: gr.update(value=None)
    }

    try:
        plan = research_and_plan(config, planner_model, tavily_client, topic, answers)
    except Exception as e:
        raise gr.Error(f"Failed during planning phase: {e}")

    status_updates += f"**Research Plan:**\n- **Topic:** {plan['detailed_topic']}\n- **Sections:** {[s.title for s in plan['sections']]}\n\n---\n"
    yield { status_box: gr.update(value=status_updates) }

    report_generator = write_report_stream(config, writer_model, tavily_client, embedding_model, reranker, plan)

    # BUG FIX: write_report_stream yields *status strings* and returns the
    # completed report text via `return` (i.e. StopIteration.value). The old
    # loop overwrote final_report_md with every yielded status line, so the
    # report pane only ever showed the last progress message. Drive the
    # generator manually so the return value can be captured.
    final_report_md = ""
    while True:
        try:
            update = next(report_generator)
        except StopIteration as stop:
            if isinstance(stop.value, str):
                final_report_md = stop.value
            break
        if isinstance(update, str):
            status_updates += update
            yield { status_box: gr.update(value=status_updates) }

    yield { final_report: gr.update(value=final_report_md) }
109
+
110
# --- Build the Gradio Interface ---
# Three stages: (1) keys + topic form, (2) clarifying questions, (3) streamed
# status plus final report. Stage visibility is toggled by the handlers above.
with gr.Blocks(css=CSS, theme=gr.themes.Soft()) as app:

    gr.Markdown("# Mini DeepSearch Agent")
    gr.Markdown("This agent performs in-depth research on a given topic, using AI to plan, search, and write a comprehensive report.")

    # State to hold the original topic across the two click handlers.
    topic_state = gr.State()

    # --- UI Stage 1: Initial Query ---
    # NOTE(review): gr.Box was removed in Gradio 4.x (gr.Group replaces it) —
    # confirm which Gradio version this app is pinned to.
    with gr.Box(visible=True) as initial_ui:
        with gr.Row():
            google_api_key_input = gr.Textbox(label="Google API Key", type="password", placeholder="Enter your Google AI API Key")
            tavily_api_key_input = gr.Textbox(label="Tavily API Key", type="password", placeholder="Enter your Tavily Search API Key")

        topic_input = gr.Textbox(label="Research Topic", placeholder="e.g., The future of renewable energy")
        start_button = gr.Button("Start Research", variant="primary")

    # --- UI Stage 2: Clarification (hidden until stage 1 completes) ---
    with gr.Box(visible=False) as clarification_ui:
        gr.Markdown("### To give you the most relevant report, could you please clarify:")
        clarification_questions_display = gr.Markdown(elem_classes="status_box")
        clarification_answers_input = gr.Textbox(label="Your Answers", placeholder="Provide your answers to the questions above to tailor the research...")
        generate_report_button = gr.Button("Generate Full Report", variant="primary")

    # --- UI Stage 3: Output (revealed by show_outputs just before streaming) ---
    with gr.Column():
        status_box = gr.Markdown(elem_classes="status_box", label="Agent Thought Process", visible=False)
        final_report = gr.Markdown(elem_classes="report_output", label="Final Research Report", visible=False)

    # --- Event Handlers ---
    def show_outputs():
        """Reveal the status/report panes before report generation starts."""
        return {
            status_box: gr.update(visible=True),
            final_report: gr.update(visible=True)
        }

    # start_research_phase returns a dict keyed by component, so the order of
    # this outputs list does not need to match the returned dict.
    start_button.click(
        fn=start_research_phase,
        inputs=[topic_input, google_api_key_input, tavily_api_key_input],
        outputs=[initial_ui, clarification_ui, clarification_questions_display]
    ).then(
        fn=lambda topic: topic,
        inputs=[topic_input],
        outputs=[topic_state]  # Save the topic for the next step
    )

    generate_report_button.click(
        fn=show_outputs,
        outputs=[status_box, final_report]
    ).then(
        fn=generate_report_phase,
        inputs=[topic_state, clarification_answers_input, google_api_key_input, tavily_api_key_input],
        outputs=[status_box, final_report]
    )

app.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ google-generativeai
2
+ tavily-python
3
+ pydantic
4
+ langchain
5
+ sentence-transformers
6
+ faiss-cpu
7
+ rank_bm25
8
+ transformers
9
+ torch
10
+ ipython
11
+ gradio
12
+ kaggle_secrets
13
+ nest_asyncio
research_agent/__init__.py ADDED
File without changes
research_agent/agent.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pydantic import BaseModel, Field
4
+ from IPython.display import display, Markdown
5
+
6
+ # Local module imports
7
+ from .config import AgentConfig
8
+ from . import prompts
9
+ from .llm_utils import run_gemini_json_completion, run_gemini_text_completion
10
+ from .search import gather_research
11
+ from .rag_pipeline import RAGPipeline
12
+
13
+ # For running async code in notebook
14
+ import nest_asyncio
15
+ nest_asyncio.apply()
16
+
17
class Section(BaseModel):
    """One planned section of the report: a title plus what it should cover."""
    title: str = Field(description="The title of the report section.")
    description: str = Field(description="A detailed description of what the section will cover, including key sub-topics.")
20
+
21
def run_verification_step(writer_model, section_text: str, research_context: str):
    """Fact-check a drafted section against the research it was written from.

    Asks the model to verify the draft (via verification_prompt_template,
    which instructs it to answer "OK" on success, or a correction otherwise).

    Args:
        writer_model: Gemini model used for the verification call.
        section_text: The drafted section markdown to verify.
        research_context: The numbered source material the draft was based on.

    Returns:
        The original section_text if verification passes, otherwise the draft
        with an appended self-correction note containing the model's fix.
    """
    verification_prompt = prompts.verification_prompt_template.format(
        section_text=section_text,
        research_context=research_context
    )

    # Temperature 0.0: deterministic, conservative fact-checking.
    verification_result = run_gemini_text_completion(writer_model, verification_prompt, 0.0)

    # BUG FIX: the old check was `"OK" in verification_result.upper()`, which
    # passed any response merely *containing* the letters "OK" (e.g. "LOOKS
    # WRONG", or a correction quoting the word "broker"). The prompt asks for
    # exactly "OK" on success, so accept only a response that starts with it.
    if verification_result.strip().upper().startswith("OK"):
        return section_text
    else:
        return f"{section_text}\n\n---\n*Self-Correction Note: An issue was found during verification. The model suggested the following correction: {verification_result}*"
34
+
35
def get_clarifying_questions(planner_model, initial_topic: str):
    """Ask the planner LLM for a short numbered list of clarifying questions."""
    clarification_prompt = prompts.clarification_prompt_template.format(
        initial_topic=initial_topic
    )
    return run_gemini_text_completion(planner_model, clarification_prompt, 0.5)
40
+
41
def research_and_plan(config: AgentConfig, planner_model, tavily_client, initial_topic: str, user_answers: str):
    """Constructs the research brief and generates the report outline.

    Pipeline: (1) fuse the user's topic and clarification answers into one
    detailed topic string, (2) run a broad Tavily search to give the planner
    context, (3) ask the planner LLM for a JSON outline, (4) expand each
    section with LLM-generated sub-topics.

    Returns:
        dict with keys "detailed_topic" (str) and "sections" (list[Section]).

    Raises:
        ValueError: if the planner response cannot be turned into Section
            objects, or yields no sections at all.
    """
    print("\n--- Step 1: Constructing Detailed Research Brief ---")
    brief_constructor_prompt = prompts.brief_constructor_prompt_template.format(
        initial_topic=initial_topic,
        user_answers=user_answers
    )
    detailed_topic = run_gemini_text_completion(planner_model, brief_constructor_prompt, config.PLANNER_TEMPERATURE).strip()

    print(f"\n--- Step 2: Performing Broad Initial Research for Outline ---")
    initial_research = gather_research(tavily_client, [detailed_topic], config.INITIAL_SEARCH_RESULTS)
    planning_context = "\n\n".join(item['content'] for item in initial_research)

    # Context is truncated to 20k chars to keep the planner prompt bounded.
    planner_prompt = prompts.planner_prompt.format(topic=detailed_topic, context=planning_context[:20000])
    plan_response = run_gemini_json_completion(planner_model, planner_prompt, config.PLANNER_TEMPERATURE)

    # run_gemini_json_completion returns {} on failure, so .get() below also
    # covers the no-response case (caught by the "no sections" check).
    try:
        initial_sections = [Section(**s) for s in plan_response.get("sections", [])]
    except Exception as e:
        raise ValueError(f"Could not create a valid report plan. Error: {e}")
    if not initial_sections:
        raise ValueError("Planner returned no sections.")

    print("\n--- Step 3: Expanding Outline for Deep Research ---")
    expanded_sections = []
    for section in initial_sections:
        expansion_prompt = prompts.expansion_prompt_template.format(section_title=section.title, section_description=section.description)
        sub_topics_text = run_gemini_text_completion(planner_model, expansion_prompt, 0.6)
        # The appended sub-topics later become extra search queries:
        # write_report_stream takes the last lines of each description.
        section.description += "\n\nKey areas to investigate:\n" + sub_topics_text
        expanded_sections.append(section)

    return {"detailed_topic": detailed_topic, "sections": expanded_sections}
73
+
74
def write_report_stream(config: AgentConfig, writer_model, tavily_client, embedding_model, reranker, plan: dict):
    """Writes the report section by section, yielding progress updates.

    Yields human-readable status strings as it works. The completed report
    markdown is the generator's *return value* (StopIteration.value), so
    callers must capture it explicitly rather than using the last yield.

    Args:
        config: AgentConfig with search/RAG/LLM settings.
        writer_model: Gemini model used for drafting and verification.
        tavily_client: Tavily search client.
        embedding_model, reranker: models backing the RAGPipeline.
        plan: dict with "detailed_topic" (str) and "sections" (list[Section]),
            as produced by research_and_plan().
    """

    detailed_topic = plan["detailed_topic"]
    sections = plan["sections"]

    yield f"### Starting Report Generation for: {detailed_topic}\n\n"

    # Accumulates the growing report text and the union of all cited URLs.
    report_state = {"full_report_text": f"# Deep Research Report: {detailed_topic}\n\n", "all_source_urls": set()}
    rag_pipeline = RAGPipeline(embedding_model, reranker)

    for i, section in enumerate(sections):
        yield f"--- \n### Processing Section {i+1}/{len(sections)}: {section.title}...\n"

        # Prior sections are given to the writer for continuity and to avoid
        # repetition across sections.
        previous_sections_context = report_state["full_report_text"]

        # Search queries: the section title plus the last 3 description lines
        # (the "Key areas to investigate" appended by research_and_plan),
        # each trimmed and capped at 400 chars.
        section_queries = [f"{detailed_topic} - {section.title}"] + section.description.split('\n')[-3:]
        section_queries = [q.strip() for q in section_queries if q.strip()]
        section_queries = [q[:400] for q in section_queries]

        yield f"-> Searching the web for: `{'`, `'.join(section_queries)}`\n"
        section_research = gather_research(tavily_client, section_queries, config.DEEP_DIVE_SEARCH_RESULTS)

        if not section_research:
            # No sources found: emit a placeholder section instead of aborting.
            section_content = f"## {section.title}\n\nNo research material could be gathered for this section.\n\n"
            report_state["full_report_text"] += section_content
            continue

        yield f"-> Found {len(section_research)} sources. Indexing for RAG...\n"
        rag_pipeline.index_research(section_research)
        top_chunks_with_meta = rag_pipeline.retrieve_and_rerank(section.description, top_k=config.CHUNKS_TO_USE_FOR_WRITING)

        # Number each unique source URL and build the "Source [N]: ..." context
        # block the writer prompt cites against.
        context_for_llm = ""
        cited_sources_for_section = {}
        citation_counter = 1
        for item in top_chunks_with_meta:
            source_url = item['source']
            report_state["all_source_urls"].add(source_url)
            if source_url not in cited_sources_for_section:
                cited_sources_for_section[source_url] = citation_counter
                citation_counter += 1
            citation_num = cited_sources_for_section[source_url]
            context_for_llm += f"Source [{citation_num}]: {item['content']}\n\n"

        bibliography = "\n".join(f"[{num}] {url}" for url, num in cited_sources_for_section.items())

        yield f"-> Synthesizing and writing section content...\n"
        writer_prompt = prompts.writer_prompt_template.format(
            writer_system_instruction=prompts.writer_system_instruction,
            previous_sections_context=previous_sections_context,
            section_title=section.title,
            context_for_llm=context_for_llm
        )

        draft_content = run_gemini_text_completion(writer_model, writer_prompt, config.WRITER_TEMPERATURE)

        yield "-> Fact-checking and verifying section...\n"
        final_content = run_verification_step(writer_model, draft_content, context_for_llm)
        final_content_with_sources = f"## {section.title}\n\n{final_content}\n\n**Sources Used in this Section**\n{bibliography}\n\n"

        report_state["full_report_text"] += final_content_with_sources

    # Sorted master bibliography across every section's sources.
    final_bibliography = "\n".join(f"- {url}" for url in sorted(list(report_state["all_source_urls"])))
    report_state["full_report_text"] += f"## Master Bibliography\n\n{final_bibliography}"

    yield "\n--- Report Generation Complete ---\n"
    return report_state["full_report_text"]
research_agent/config.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
class AgentConfig:
    """Configuration settings for the Max-Depth agent."""
    # Gemini model name; app.py uses this same constant for both the writer
    # and the planner model instances.
    WRITER_MODEL = "gemini-1.5-flash-latest"

    # Research settings
    INITIAL_SEARCH_RESULTS = 5     # Tavily results for the broad outline search
    DEEP_DIVE_SEARCH_RESULTS = 7   # Tavily results per section deep-dive query

    # RAG settings
    # NOTE(review): CHUNKS_TO_RETRIEVE is not referenced by the visible
    # modules (retrieve_and_rerank over-fetches top_k * 2 itself) — confirm.
    CHUNKS_TO_RETRIEVE = 30
    CHUNKS_TO_USE_FOR_WRITING = 10  # top reranked chunks fed to the writer

    # LLM settings (sampling temperatures)
    WRITER_TEMPERATURE = 0.4
    PLANNER_TEMPERATURE = 0.2
    NLU_TEMPERATURE = 0.1
research_agent/llm_utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import google.generativeai as genai
3
+ from google.generativeai.types import GenerationConfig, HarmCategory, HarmBlockThreshold
4
+
5
def run_gemini_json_completion(model, prompt: str, temperature: float):
    """Call Gemini in JSON mode and return the parsed object.

    Returns {} (after printing a warning) if the call or parsing fails for
    any reason, so callers can treat the result as a plain dict.
    """
    try:
        generation_config = GenerationConfig(
            response_mime_type="application/json",
            temperature=temperature,
        )
        response = model.generate_content(prompt, generation_config=generation_config)
        return json.loads(response.text)
    except Exception as e:
        print(f"Warning: Failed to parse JSON from Gemini. Error: {e}")
        return {}
13
+
14
def run_gemini_text_completion(model, prompt: str, temperature: float):
    """Call Gemini for plain text at the given temperature.

    On any failure, returns an inline "[Error: ...]" marker string instead of
    raising, so the pipeline keeps going.
    """
    gen_config = GenerationConfig(temperature=temperature)
    safety = [{"category": HarmCategory.HARM_CATEGORY_HARASSMENT, "threshold": HarmBlockThreshold.BLOCK_ONLY_HIGH}]
    try:
        return model.generate_content(
            prompt,
            generation_config=gen_config,
            safety_settings=safety,
        ).text
    except Exception as e:
        return f"[Error: Could not generate response. Details: {e}]"
research_agent/prompts.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# The system instruction now has a stronger mandate to use ONLY provided sources.
# Injected into writer_prompt_template via its {writer_system_instruction} slot.
writer_system_instruction = """
You are a distinguished academic researcher. Your primary function is to synthesize information ONLY from the provided research materials.
You MUST ignore any of your own prior knowledge and base your writing exclusively on the text provided to you.
You are meticulous about citing your sources. When you make a factual claim, you MUST cite the source.
"""
7
+
8
# The planner prompt is now more forceful about using the context.
# BUG FIX: this template is rendered with str.format(topic=..., context=...)
# (see research_and_plan), so every literal brace in the JSON example must be
# doubled ({{ / }}) — otherwise .format() raises KeyError/ValueError at
# runtime. The sibling query prompts below already escape theirs.
planner_prompt = """
Your task is to create a detailed report outline based on the provided research topic.
You MUST respond with ONLY a valid JSON object.
The JSON object must contain a key "sections", which is a list of objects.
Each object in the "sections" list MUST have two keys: "title" and "description".

Topic: '{topic}'
Context: {context}

Example of a perfect response:
{{
    "sections": [
        {{
            "title": "Introduction to Vertical Farming",
            "description": "A brief overview of the concept, its history, and its relevance in modern agriculture."
        }},
        {{
            "title": "Key Technologies and Methods",
            "description": "An exploration of the core technologies like hydroponics, aeroponics, and LED lighting that enable vertical farming."
        }}
    ]
}}
"""
32
+
33
# The writer prompt is now more forceful about citations and ignoring prior knowledge.
# Placeholders: {topic}, {section_title}, {research}.
section_writer_prompt = """
Your task is to write a single, detailed, and analytical section for a research paper on the topic of '{topic}'.
The section you are writing is: '## {section_title}'

**CRITICAL INSTRUCTIONS:**
1. **USE ONLY PROVIDED SOURCES:** You MUST base your writing entirely on the "Research Material" provided below. Do not add any information from your own knowledge.
2. **CITE EVERYTHING:** Every factual statement you make must be followed by an in-text citation in the format `[Source X]`, where 'X' is the number of the source from the list. If a single sentence synthesizes from multiple sources, cite them all (e.g., `[Source 1][Source 3]`).
3. **SYNTHESIZE, DON'T SUMMARIZE:** Analyze and connect the information from different sources to build a comprehensive narrative.
4. **FORMAL TONE:** Maintain a formal, academic tone.

**Research Material (Sources are numbered):**
---
{research}
---
Now, write the complete, cited content for the '{section_title}' section, remembering to cite every fact.
"""

# The final section prompt is also made more forceful.
# Placeholders: {section_title}, {topic}, {body_content}, {source_urls}.
final_section_writer_prompt = """
Your task is to write the {section_title} for a research paper on '{topic}'.
You MUST ONLY use the provided "Main Body Content" to write this section. Do not introduce any new information.

- For an **Introduction**, set the stage by summarizing the key themes present in the provided body content.
- For a **Conclusion**, synthesize the findings from the body content and discuss their implications.
- **At the end of the conclusion text**, add a `### Bibliography` section and list every single URL from the provided `Source URLs for Bibliography`.

**Main Body Content of the Report:**
---
{body_content}
---

**Source URLs for Bibliography:**
---
{source_urls}
---
Now, write the complete content for the '{section_title}' section.
"""

# The query writer can remain the same.
# Both templates correctly double their literal JSON braces ({{ }}) because
# they are rendered with str.format().
initial_research_prompt = """Generate 3 broad search queries for the topic: '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
query_writer_prompt = """Generate {num_queries} specific search queries for the report section titled '{section_title}' about '{topic}'. Respond with ONLY a valid JSON object like this: {{"queries": ["query 1", "query 2"]}}"""
75
+
76
# Rendered with .format(initial_topic=...) by agent.get_clarifying_questions.
clarification_prompt_template = """
You are a research assistant. To provide the most relevant report on '{initial_topic}', generate 3-4 clarifying questions for the user.
These questions should help narrow down the scope, perspective, and focus of the research.
Present them as a simple, clear, numbered list.
"""

# Rendered with .format(initial_topic=..., user_answers=...). Its output
# becomes the "detailed topic" used both as search query and report title.
brief_constructor_prompt_template = """
Synthesize the following user request into a single, concise, and factual research topic string.
- User's Initial Topic: '{initial_topic}'
- User's Refinements: '{user_answers}'
RULES: Do NOT add any conversational preamble. The output MUST be a single, clean string suitable for a report title.
Example Output: A comprehensive analysis of Elon Musk's impact on space exploration and sustainable energy.
"""

# Rendered with .format(section_title=..., section_description=...).
expansion_prompt_template = "Given the report section '{section_title}: {section_description}', generate 3-5 specific sub-topics or key questions to investigate."

# Fact-checking prompt: the agent treats a response of "OK" as a pass and
# anything else as a suggested correction (see agent.run_verification_step).
verification_prompt_template = """
Here is a draft of a report section and the source material it was based on.
Your task is to act as a fact-checker. Read the draft and verify three things:
1. Are there any factual claims in the draft that are NOT supported by the source material?
2. Are there any misinterpretations of the source material (e.g., confusing a company's sale price with an investment)?
3. Is the draft free of future-dated or clearly speculative dates presented as fact?

If all checks pass, respond with "OK".
If you find an error, respond with a corrected version of the specific sentence or paragraph.

**DRAFT TO VERIFY:**
---
{section_text}
---

**SOURCE MATERIAL:**
---
{research_context}
---

Verification Result:
"""

# Main section-writing prompt used by agent.write_report_stream. Placeholders:
# {writer_system_instruction}, {previous_sections_context}, {section_title},
# {context_for_llm}.
writer_prompt_template = """
{writer_system_instruction}

**Report So Far (for context and to avoid repetition):**
---
{previous_sections_context}
---

Now, using the following research material, write the next section of the report: '## {section_title}'.
CITE EVERY FACT using [Source X] format. Ensure your writing flows naturally from the 'Report So Far'.

**Research Material for this Section:**
---
{context_for_llm}
---

Section Content:
"""
research_agent/rag_pipeline.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import numpy as np
3
+ import faiss
4
+ from rank_bm25 import BM25Okapi
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
class RAGPipeline:
    """A pipeline for Retrieval-Augmented Generation.

    Hybrid retrieval: dense (FAISS over sentence-transformer embeddings) plus
    sparse (BM25), with a cross-encoder rerank of the merged candidates.
    Call index_research() before retrieve_and_rerank(); re-indexing replaces
    any previously indexed material.
    """
    def __init__(self, embedding_model, reranker):
        # 1000-char chunks with 150-char overlap so facts spanning a chunk
        # boundary remain retrievable.
        self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        self.embedding_model = embedding_model
        self.reranker = reranker
        self.chunks_with_meta = []  # [{'content': chunk, 'source': url}, ...]
        self.faiss_index = None     # dense index, built by index_research()
        self.bm25_index = None      # sparse index, built by index_research()
        self.all_chunks = []        # chunk texts, parallel to chunks_with_meta

    def index_research(self, research_items: List[dict]):
        """Create an index of research material for fast retrieval.

        Args:
            research_items: dicts with 'content' and 'source' keys (as
                produced by search.gather_research). Replaces prior indexes.
        """
        self.chunks_with_meta = []
        self.all_chunks = []
        for item in research_items:
            chunks = self.text_splitter.split_text(item['content'])
            for chunk in chunks:
                self.chunks_with_meta.append({'content': chunk, 'source': item['source']})
                self.all_chunks.append(chunk)

        if not self.all_chunks:
            print("Warning: No chunks to index.")
            return

        print(f"--> Embedding {len(self.all_chunks)} chunks...")
        embeddings = self.embedding_model.encode(self.all_chunks, convert_to_tensor=False)
        # Flat L2 index: exact (non-approximate) nearest-neighbor search.
        self.faiss_index = faiss.IndexFlatL2(embeddings.shape[1])
        self.faiss_index.add(np.array(embeddings, dtype=np.float32))

        # Naive whitespace tokenization for the BM25 corpus.
        tokenized_corpus = [doc.split(" ") for doc in self.all_chunks]
        self.bm25_index = BM25Okapi(tokenized_corpus)

    def retrieve_and_rerank(self, query: str, top_k: int = 10):
        """Retrieve relevant chunks and rerank them for the final context.

        Returns up to top_k chunk dicts ({'content', 'source'}), best first;
        [] if nothing has been indexed yet.
        """
        if not self.chunks_with_meta or self.faiss_index is None or self.bm25_index is None:
            return []

        print(f"--> Retrieving and re-ranking for query: '{query[:50]}...'")

        # Over-fetch 2x top_k from each retriever; the reranker picks the best.
        query_embedding = self.embedding_model.encode([query], convert_to_tensor=False)
        distances, faiss_indices = self.faiss_index.search(np.array(query_embedding, dtype=np.float32), k=min(top_k * 2, len(self.all_chunks)))

        tokenized_query = query.split(" ")
        bm25_scores = self.bm25_index.get_scores(tokenized_query)
        bm25_indices = np.argsort(bm25_scores)[::-1][:min(top_k * 2, len(self.all_chunks))]

        # Union of dense and sparse candidate indices (de-duplicated).
        combined_indices = set(faiss_indices[0]).union(set(bm25_indices))

        # NOTE: both this list and the zip() below iterate the same unmodified
        # set object, so the score <-> index correspondence holds in-call.
        rerank_pairs = [[query, self.chunks_with_meta[idx]['content']] for idx in combined_indices]

        if not rerank_pairs:
            return []

        scores = self.reranker.predict(rerank_pairs)

        # Highest cross-encoder score first.
        scored_items = sorted(zip(scores, combined_indices), key=lambda x: x[0], reverse=True)

        final_results = [self.chunks_with_meta[idx] for score, idx in scored_items[:top_k]]

        return final_results
research_agent/search.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ from tavily import TavilyClient
3
+
4
def gather_research(tavily_client: TavilyClient, queries: List[str], num_results: int) -> List[dict]:
    """Run each query through Tavily and collect content/source-url records.

    Failed queries are logged and skipped; everything else is flattened into
    a single list of {"content": ..., "source": ...} dicts.
    """
    research_with_sources: List[dict] = []
    print(f"-> Gathering research for {len(queries)} queries (max {num_results} results each)...")
    for query in queries:
        try:
            response = tavily_client.search(
                query=query,
                search_depth="advanced",
                max_results=num_results,
                include_raw_content=True,
            )
            for hit in response['results']:
                content = hit.get('content')
                url = hit.get('url')
                # Keep only results that carry both a body and a source URL.
                if content and url:
                    research_with_sources.append({"content": content, "source": url})
        except Exception as e:
            print(f"Tavily search failed for '{query}': {e}")
    return research_with_sources