Spaces:
Sleeping
Sleeping
Update research_agent/agent.py
Browse files- research_agent/agent.py +386 -116
research_agent/agent.py
CHANGED
|
@@ -1,25 +1,191 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import time
|
|
|
|
| 4 |
from datetime import datetime
|
| 5 |
-
from typing import List, Dict, Any, Generator
|
| 6 |
import google.generativeai as genai
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
def get_clarifying_questions(model, topic: str) -> str:
|
| 10 |
-
"""Generate clarifying questions for
|
| 11 |
prompt = f"""
|
| 12 |
-
You are a research
|
| 13 |
|
| 14 |
-
|
| 15 |
-
- Specific aspects
|
| 16 |
-
-
|
| 17 |
-
-
|
| 18 |
-
-
|
| 19 |
-
-
|
|
|
|
| 20 |
|
| 21 |
-
Format
|
| 22 |
-
Be concise but thorough.
|
| 23 |
|
| 24 |
Topic: {topic}
|
| 25 |
"""
|
|
@@ -29,173 +195,277 @@ Topic: {topic}
|
|
| 29 |
return response.text
|
| 30 |
except Exception as e:
|
| 31 |
return f"""
|
| 32 |
-
1. What specific aspects of {topic}
|
| 33 |
-
2. Who is the intended audience for this research
|
| 34 |
-
3. Are you looking for recent developments, historical
|
| 35 |
-
4. What
|
| 36 |
-
5.
|
| 37 |
-
|
| 38 |
-
Please provide your answers to help me create the most relevant research report for you.
|
| 39 |
"""
|
| 40 |
|
| 41 |
|
| 42 |
-
def research_and_plan(config,
|
| 43 |
-
"""Create
|
| 44 |
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
|
|
|
| 49 |
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
{{
|
| 52 |
-
"detailed_topic": "
|
| 53 |
"sections": [
|
| 54 |
-
{{
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
{{"title": "Conclusion", "description": "Summary and key takeaways"}}
|
| 61 |
-
],
|
| 62 |
-
"research_questions": [
|
| 63 |
-
"What is the current state of {topic}?",
|
| 64 |
-
"What are the key challenges and opportunities?",
|
| 65 |
-
"What are the future implications and trends?"
|
| 66 |
]
|
| 67 |
}}
|
| 68 |
|
| 69 |
-
Make
|
| 70 |
"""
|
| 71 |
|
| 72 |
try:
|
| 73 |
-
response =
|
| 74 |
-
|
|
|
|
|
|
|
| 75 |
|
| 76 |
-
#
|
|
|
|
| 77 |
json_start = response_text.find('{')
|
| 78 |
json_end = response_text.rfind('}') + 1
|
| 79 |
|
| 80 |
if json_start != -1 and json_end != -1:
|
| 81 |
json_text = response_text[json_start:json_end]
|
| 82 |
plan_data = json.loads(json_text)
|
|
|
|
| 83 |
else:
|
| 84 |
-
raise ValueError("No valid JSON found
|
| 85 |
|
| 86 |
-
return plan_data
|
| 87 |
-
|
| 88 |
except Exception as e:
|
| 89 |
-
print(f"
|
| 90 |
# Fallback plan
|
| 91 |
return {
|
| 92 |
-
"detailed_topic":
|
| 93 |
"sections": [
|
| 94 |
-
{
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
{
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
]
|
| 107 |
}
|
| 108 |
|
| 109 |
|
| 110 |
-
def write_report_stream(config,
|
| 111 |
-
"""Generate
|
| 112 |
|
| 113 |
-
sections = plan.get('sections', [])
|
| 114 |
detailed_topic = plan.get('detailed_topic', 'Research Topic')
|
|
|
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
report_content = f"# {detailed_topic}\n\n"
|
| 118 |
-
report_content += f"*
|
| 119 |
|
| 120 |
-
|
| 121 |
-
|
|
|
|
| 122 |
|
| 123 |
-
yield "
|
| 124 |
-
time.sleep(0.3)
|
| 125 |
|
| 126 |
-
# Conduct initial broad search
|
| 127 |
-
try:
|
| 128 |
-
search_results = tavily_client.search(
|
| 129 |
-
query=detailed_topic,
|
| 130 |
-
max_results=5,
|
| 131 |
-
search_depth="advanced"
|
| 132 |
-
)
|
| 133 |
-
yield f"Found {len(search_results.get('results', []))} initial sources"
|
| 134 |
-
except Exception as e:
|
| 135 |
-
yield f"Search warning: {str(e)}"
|
| 136 |
-
search_results = {"results": []}
|
| 137 |
-
|
| 138 |
-
time.sleep(0.3)
|
| 139 |
-
|
| 140 |
-
# Process each section
|
| 141 |
for i, section in enumerate(sections):
|
| 142 |
section_title = section.get('title', f'Section {i+1}')
|
| 143 |
section_desc = section.get('description', '')
|
|
|
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
| 149 |
Write a comprehensive section titled "{section_title}" for a research report on "{detailed_topic}".
|
| 150 |
|
| 151 |
-
Section
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
Requirements:
|
| 154 |
-
- Write
|
| 155 |
-
-
|
| 156 |
-
-
|
| 157 |
-
-
|
| 158 |
-
-
|
|
|
|
| 159 |
|
| 160 |
-
Write only the section content without the title
|
|
|
|
| 161 |
"""
|
| 162 |
|
| 163 |
try:
|
| 164 |
-
|
| 165 |
-
|
| 166 |
generation_config=genai.types.GenerationConfig(
|
| 167 |
-
temperature=
|
| 168 |
-
max_output_tokens=
|
| 169 |
)
|
| 170 |
)
|
| 171 |
-
section_content =
|
| 172 |
except Exception as e:
|
| 173 |
-
section_content = f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
# Add section to report
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
|
| 178 |
-
|
| 179 |
-
yield report_content
|
| 180 |
-
time.sleep(0.4)
|
| 181 |
|
| 182 |
-
# Add
|
| 183 |
-
yield "
|
| 184 |
|
| 185 |
-
|
|
|
|
|
|
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
-
report_content +=
|
| 196 |
|
| 197 |
-
yield "Research
|
| 198 |
-
time.sleep(0.2)
|
| 199 |
|
| 200 |
# Final yield with complete report
|
| 201 |
yield report_content
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
+
import re
|
| 5 |
from datetime import datetime
|
| 6 |
+
from typing import List, Dict, Any, Generator, Tuple
|
| 7 |
import google.generativeai as genai
|
| 8 |
+
from tavily import TavilyClient
|
| 9 |
+
from sentence_transformers import SentenceTransformer, CrossEncoder
|
| 10 |
+
import numpy as np
|
| 11 |
+
from urllib.parse import urlparse
|
| 12 |
+
import hashlib
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class RAGPipeline:
    """RAG pipeline for document indexing, retrieval and re-ranking.

    Chunks source documents, embeds the chunks with a bi-encoder for fast
    semantic retrieval, then re-scores the top candidates with a
    cross-encoder for higher-precision ranking.
    """

    def __init__(self, embedding_model, reranker):
        # embedding_model: exposes .encode(list[str]) -> 2-D array of vectors
        # reranker: exposes .predict(list[(query, text)]) -> list of scores
        self.embedding_model = embedding_model
        self.reranker = reranker
        self.documents = []      # list of {'content','source','title','chunk_id'}
        self.embeddings = None   # 2-D array aligned row-for-row with self.documents

    def chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
        """Split *text* into overlapping segments of roughly *chunk_size* chars.

        Chunks are shortened to the nearest sentence boundary when one falls
        in the second half of the window, and consecutive chunks overlap by
        *overlap* characters so context is not lost at the seams.
        """
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0
        while start < len(text):
            end = start + chunk_size
            chunk = text[start:end]

            # Prefer to end on a sentence boundary, but only if that keeps
            # the chunk at least half-sized (avoids degenerate tiny chunks).
            if end < len(text):
                last_period = chunk.rfind('. ')
                if last_period > chunk_size // 2:
                    end = start + last_period + 2
                    chunk = text[start:end]

            chunks.append(chunk.strip())

            # Fix: stop once the window has reached the end of the text;
            # the original kept looping (start = end - overlap) and emitted
            # a duplicated tail fragment as an extra final chunk.
            if end >= len(text):
                break
            start = end - overlap

        return chunks

    def index_research(self, research_items: List[Dict]):
        """Chunk and embed *research_items* ({'content','url','title'} dicts) for retrieval."""
        self.documents = []

        for item in research_items:
            content = item.get('content', '')
            source = item.get('url', 'Unknown')
            title = item.get('title', 'Untitled')

            for i, chunk in enumerate(self.chunk_text(content)):
                if len(chunk.strip()) > 100:  # skip very short, low-signal chunks
                    self.documents.append({
                        'content': chunk,
                        'source': source,
                        'title': title,
                        'chunk_id': i
                    })

        if self.documents:
            # Generate embeddings for all retained chunks in one batch.
            texts = [doc['content'] for doc in self.documents]
            self.embeddings = self.embedding_model.encode(texts, show_progress_bar=False)

    def retrieve_and_rerank(self, query: str, top_k: int = 10) -> List[Dict]:
        """Return the *top_k* most relevant chunks for *query*, cross-encoder ranked.

        Returns [] when nothing has been indexed. Each returned dict is a
        copy of the stored chunk with a 'relevance_score' field added.
        """
        if not self.documents or self.embeddings is None:
            return []

        # Semantic search. Fix: use cosine similarity (normalized dot
        # product) — ranking by raw dot products biases results toward
        # chunks whose embedding vectors happen to have larger norms.
        doc_matrix = np.asarray(self.embeddings, dtype=float)
        query_embedding = np.asarray(self.embedding_model.encode([query]), dtype=float)
        doc_norms = np.linalg.norm(doc_matrix, axis=1)
        query_norm = np.linalg.norm(query_embedding, axis=1)[0]
        denom = np.maximum(doc_norms * query_norm, 1e-12)  # guard zero vectors
        similarities = (doc_matrix @ query_embedding[0]) / denom

        # Over-fetch candidates so the re-ranker has room to reorder.
        top_indices = np.argsort(similarities)[::-1][:top_k * 2]
        candidates = [self.documents[int(i)] for i in top_indices]

        # Re-rank with the cross-encoder, which scores (query, text) jointly.
        pairs = [(query, doc['content']) for doc in candidates]
        scores = self.reranker.predict(pairs)

        ranked_results = []
        for doc, score in zip(candidates, scores):
            doc_copy = doc.copy()
            doc_copy['relevance_score'] = float(score)
            ranked_results.append(doc_copy)

        ranked_results.sort(key=lambda x: x['relevance_score'], reverse=True)
        return ranked_results[:top_k]
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def gather_research(tavily_client, queries: List[str], max_results_per_query: int = 5) -> List[Dict]:
    """Run every query through the Tavily client and return deduplicated results.

    Each returned dict carries 'title', 'url', 'content', 'raw_content',
    'score' and the 'query' that surfaced it. Results are deduplicated by
    URL (first occurrence wins); results without a URL are dropped. A
    failing search is logged and skipped rather than aborting the batch.
    """
    all_results = []
    seen_urls = set()

    for query_index, query in enumerate(queries):
        # Fix: rate-limit *between* requests only — the original slept
        # after the final query as well, adding pointless latency.
        if query_index:
            time.sleep(0.5)

        try:
            print(f"  Searching: {query[:50]}...")
            search_results = tavily_client.search(
                query=query,
                max_results=max_results_per_query,
                search_depth="advanced",
                include_answer=True,
                include_raw_content=True
            )
        except Exception as e:
            # Best-effort gathering: one bad query must not sink the batch.
            print(f"  Search error for '{query}': {str(e)}")
            continue

        for result in search_results.get('results', []):
            url = result.get('url', '')
            if url and url not in seen_urls:
                seen_urls.add(url)
                all_results.append({
                    'title': result.get('title', 'Unknown'),
                    'url': url,
                    'content': result.get('content', ''),
                    'raw_content': result.get('raw_content', ''),
                    'score': result.get('score', 0.0),
                    'query': query
                })

    return all_results
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def run_verification_step(writer_model, section_text: str, research_context: str) -> str:
    """Fact-check *section_text* against *research_context* with the LLM.

    Returns the section unchanged when the model reports it as accurate,
    appends the model's findings as a verification note when issues are
    found, and falls back to the unverified text on any API failure —
    verification is best-effort and must never block report generation.
    """
    verification_prompt = f"""
You are a fact-checker. Review this section and the research context to identify any potential inaccuracies, unsupported claims, or hallucinations.

SECTION TO VERIFY:
{section_text}

RESEARCH CONTEXT:
{research_context[:3000]}

Check for:
1. Claims not supported by the research
2. Factual inaccuracies
3. Misleading statements
4. Missing context

If the section is accurate and well-supported, respond with "VERIFIED: Section is accurate."
If issues are found, respond with "ISSUES FOUND:" followed by specific problems and suggested corrections.
"""

    try:
        # Low temperature: fact-checking should be as deterministic as possible.
        response = writer_model.generate_content(
            verification_prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.1)
        )
        verification_result = response.text

        # Fix: the original tested `"VERIFIED" in result.upper()`, which
        # also matches replies like "... could not be VERIFIED". Only a
        # reply that *starts* with the verified marker counts as a pass.
        if verification_result.strip().upper().startswith("VERIFIED"):
            return section_text
        return f"{section_text}\n\n*Verification Note: {verification_result}*"
    except Exception:
        # Best-effort: on any model/API error, ship the unverified text.
        return section_text
|
| 173 |
|
| 174 |
|
| 175 |
def get_clarifying_questions(model, topic: str) -> str:
|
| 176 |
+
"""Generate clarifying questions for research focus"""
|
| 177 |
prompt = f"""
|
| 178 |
+
You are a research strategist. For the topic "{topic}", generate 4-6 specific clarifying questions that will help create a more focused and comprehensive research report.
|
| 179 |
|
| 180 |
+
Focus on:
|
| 181 |
+
- Specific aspects or subtopics of interest
|
| 182 |
+
- Target audience and use case
|
| 183 |
+
- Geographical or temporal scope
|
| 184 |
+
- Depth and technical level required
|
| 185 |
+
- Particular perspectives or angles
|
| 186 |
+
- Current vs historical focus
|
| 187 |
|
| 188 |
+
Format as numbered questions. Be specific and actionable.
|
|
|
|
| 189 |
|
| 190 |
Topic: {topic}
|
| 191 |
"""
|
|
|
|
| 195 |
return response.text
|
| 196 |
except Exception as e:
|
| 197 |
return f"""
|
| 198 |
+
1. What specific aspects of {topic} are you most interested in exploring?
|
| 199 |
+
2. Who is the intended audience for this research?
|
| 200 |
+
3. Are you looking for recent developments, historical analysis, or both?
|
| 201 |
+
4. What geographic regions or markets should be the focus?
|
| 202 |
+
5. What level of technical detail is appropriate?
|
| 203 |
+
6. Are there particular challenges or opportunities you want to emphasize?
|
|
|
|
| 204 |
"""
|
| 205 |
|
| 206 |
|
| 207 |
+
def research_and_plan(config, planner_model, tavily_client, topic: str, clarifications: str) -> Dict[str, Any]:
    """Create a comprehensive research plan with per-section search strategies.

    Refines the topic from the user's clarifications, runs a small round of
    initial web research for planning context, then asks the planner model
    for a JSON section plan. Falls back to a generic six-section plan when
    the model response cannot be parsed.
    """
    # Step 1: Construct detailed research brief
    brief_prompt = f"""
Based on the initial topic and user clarifications, create a detailed, focused research brief.

Initial Topic: {topic}
User Clarifications: {clarifications}

Create a refined, specific research focus that incorporates the user's requirements. Be precise about scope, angle, and key areas to investigate.

Respond with just the refined research brief (2-3 sentences):
"""

    try:
        response = planner_model.generate_content(brief_prompt)
        detailed_topic = response.text.strip()
    except Exception:
        # Keep going with a generic brief rather than failing the whole run.
        detailed_topic = f"Comprehensive analysis of {topic}"

    # Step 2: Initial broad research for context
    print("Conducting initial research for planning...")
    initial_queries = [detailed_topic, f"{topic} overview", f"{topic} recent developments"]
    initial_research = gather_research(tavily_client, initial_queries, 3)

    planning_context = "\n\n".join(
        f"Source: {item['title']}\n{item['content'][:500]}"
        for item in initial_research[:10]
    )

    # Step 3: Generate detailed section plan
    planning_prompt = f"""
Create a comprehensive research plan for: {detailed_topic}

Research Context:
{planning_context}

Generate 6-8 detailed sections with specific search strategies for each.

Respond in JSON format:
{{
    "detailed_topic": "{detailed_topic}",
    "sections": [
        {{
            "title": "Section Title",
            "description": "Detailed description of what this section will cover",
            "search_queries": ["specific query 1", "specific query 2", "specific query 3"],
            "key_questions": ["key question 1", "key question 2"]
        }}
    ]
}}

Make search queries specific and varied to capture different perspectives and sources.
"""

    try:
        response = planner_model.generate_content(
            planning_prompt,
            generation_config=genai.types.GenerationConfig(temperature=0.3)
        )

        # Extract the first {...} span from the (possibly fenced) response.
        response_text = response.text.strip()
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1

        # Fix: rfind returns -1 when '}' is absent, so json_end becomes 0 —
        # the original compared json_end against -1 and could never detect
        # that case. Require a well-ordered, non-empty span instead.
        if json_start != -1 and json_end > json_start:
            return json.loads(response_text[json_start:json_end])
        raise ValueError("No valid JSON found")

    except Exception as e:
        print(f"Planning error: {str(e)}")
        # Fallback plan: a generic six-section arc for any research topic.
        return {
            "detailed_topic": detailed_topic,
            "sections": [
                {
                    "title": "Introduction and Background",
                    "description": "Historical context and foundational overview",
                    "search_queries": [f"{topic} history", f"{topic} background", f"what is {topic}"],
                    "key_questions": [f"What is {topic}?", f"How did {topic} develop?"]
                },
                {
                    "title": "Current State and Recent Developments",
                    "description": "Present situation and latest updates",
                    "search_queries": [f"{topic} 2024", f"{topic} recent news", f"{topic} current trends"],
                    "key_questions": [f"What is the current state of {topic}?", "What are recent developments?"]
                },
                {
                    "title": "Key Players and Market Analysis",
                    "description": "Important organizations, companies, and market dynamics",
                    "search_queries": [f"{topic} companies", f"{topic} market leaders", f"{topic} industry analysis"],
                    "key_questions": ["Who are the key players?", "What is the market structure?"]
                },
                {
                    "title": "Challenges and Opportunities",
                    "description": "Current challenges and future opportunities",
                    "search_queries": [f"{topic} challenges", f"{topic} opportunities", f"{topic} problems"],
                    "key_questions": ["What are the main challenges?", "What opportunities exist?"]
                },
                {
                    "title": "Future Outlook and Trends",
                    "description": "Predictions and emerging trends",
                    "search_queries": [f"{topic} future", f"{topic} predictions", f"{topic} trends 2024"],
                    "key_questions": ["What does the future hold?", "What trends are emerging?"]
                },
                {
                    "title": "Conclusion and Implications",
                    "description": "Summary and broader implications",
                    "search_queries": [f"{topic} implications", f"{topic} impact", f"{topic} summary"],
                    "key_questions": ["What are the key takeaways?", "What are the broader implications?"]
                }
            ]
        }
|
| 325 |
|
| 326 |
|
| 327 |
+
def write_report_stream(config, writer_model, tavily_client, embedding_model, reranker, plan: Dict[str, Any]) -> Generator[str, None, None]:
    """Generate a comprehensive, cited research report, streaming progress.

    Yields human-readable progress strings while working; the final yield
    is the complete markdown report (sections with per-section sources, a
    master bibliography and a methodology appendix).

    NOTE(review): the progress-marker emojis in the source were mojibake;
    they are reconstructed here with plausible equivalents — confirm
    against the original file.
    """
    detailed_topic = plan.get('detailed_topic', 'Research Topic')
    sections = plan.get('sections', [])

    # Initialize report state
    report_content = f"# Deep Research Report: {detailed_topic}\n\n"
    report_content += f"*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n\n"

    all_sources = {}        # url -> {'number','title','url'}: global citation registry
    citation_counter = 1    # citation numbers are global so the bibliography stays consistent
    rag_pipeline = RAGPipeline(embedding_model, reranker)

    yield f"🔬 **Starting Deep Research Process**\n\n**Topic:** {detailed_topic}\n**Sections:** {len(sections)}\n\n---\n\n"

    for i, section in enumerate(sections):
        section_title = section.get('title', f'Section {i+1}')
        section_desc = section.get('description', '')
        search_queries = section.get('search_queries', [f"{detailed_topic} {section_title}"])

        yield f"### 📖 Section {i+1}/{len(sections)}: {section_title}\n\n"

        # Gather research for this section, capped at 3 queries to bound cost.
        yield f"🔍 **Searching web sources...**\n"
        section_queries = search_queries[:3]
        for j, query in enumerate(section_queries):
            yield f"   → Query {j+1}: `{query}`\n"

        # Fix: the original announced only the first 3 queries but then
        # searched the *full* list — search exactly what was announced.
        section_research = gather_research(tavily_client, section_queries, config.DEEP_DIVE_SEARCH_RESULTS)

        if not section_research:
            yield f"⚠️ No sources found for this section\n\n"
            continue

        yield f"✅ **Found {len(section_research)} sources**\n\n"
        yield f"📊 **Processing and ranking content...**\n"

        # Index this section's research and retrieve the most relevant chunks.
        rag_pipeline.index_research(section_research)
        relevant_chunks = rag_pipeline.retrieve_and_rerank(
            section_desc,
            top_k=config.CHUNKS_TO_USE_FOR_WRITING
        )

        # Build the LLM context with stable [Source N] citation markers.
        context_for_llm = ""
        section_sources = {}  # url -> citation number used in this section

        for chunk in relevant_chunks:
            source_url = chunk['source']
            if source_url not in all_sources:
                all_sources[source_url] = {
                    'number': citation_counter,
                    'title': chunk.get('title', 'Unknown Title'),
                    'url': source_url
                }
                citation_counter += 1

            source_num = all_sources[source_url]['number']
            section_sources[source_url] = source_num
            context_for_llm += f"[Source {source_num}] {chunk['content']}\n\n"

        yield f"✍️ **Writing section content...**\n"

        # Generate section content grounded in the retrieved context.
        writer_prompt = f"""
Write a comprehensive section titled "{section_title}" for a research report on "{detailed_topic}".

Section Description: {section_desc}

Research Context:
{context_for_llm}

Requirements:
- Write 4-6 well-structured paragraphs
- Use information from the provided sources
- Include in-text citations using [Source X] format
- Maintain academic writing style
- Ensure accuracy and relevance
- Connect logically to the overall topic

Write only the section content (without the title - it will be added automatically).
Include proper citations for all claims using the [Source X] format provided in the context.
"""

        try:
            response = writer_model.generate_content(
                writer_prompt,
                generation_config=genai.types.GenerationConfig(
                    temperature=config.WRITER_TEMPERATURE,
                    max_output_tokens=1500
                )
            )
            section_content = response.text.strip()
        except Exception as e:
            # Keep the report flowing; surface the failure inline instead.
            section_content = f"Error generating content: {str(e)}"

        yield f"🔎 **Fact-checking content...**\n"

        # Verification step (best-effort; never blocks the report).
        verified_content = run_verification_step(writer_model, section_content, context_for_llm[:2000])

        # Add section to report with its own source list.
        section_bibliography = "\n".join(
            f"[{num}] {all_sources[url]['title']} - {url}"
            for url, num in section_sources.items()
        )

        final_section = f"## {section_title}\n\n{verified_content}\n\n**Section Sources:**\n{section_bibliography}\n\n"
        report_content += final_section

        yield f"✅ **Section {i+1} completed**\n\n---\n\n"

    # Add master bibliography, ordered by global citation number.
    yield f"📚 **Compiling final bibliography...**\n"

    master_bibliography = "## Complete Bibliography\n\n"
    for source_data in sorted(all_sources.values(), key=lambda x: x['number']):
        master_bibliography += f"[{source_data['number']}] {source_data['title']}\n    {source_data['url']}\n\n"

    report_content += master_bibliography

    # Add methodology section describing how the report was produced.
    methodology = f"""## Research Methodology

This report was generated using a comprehensive research methodology:

1. **Topic Refinement**: Initial topic was refined based on user clarifications
2. **Multi-Query Search**: Each section used 3-5 targeted search queries
3. **Source Gathering**: Collected {len(all_sources)} unique sources using advanced web search
4. **Content Processing**: Documents were chunked and embedded for semantic retrieval
5. **Relevance Ranking**: Used cross-encoder re-ranking for optimal content selection
6. **Citation Integration**: All claims are supported by cited sources
7. **Fact Verification**: Each section underwent verification for accuracy
8. **Quality Assurance**: Final review for coherence and completeness

*Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} using AI-powered research pipeline*
"""

    report_content += methodology

    yield f"🎉 **Research Complete!**\n\n**Final Report:**\n- {len(sections)} sections\n- {len(all_sources)} sources cited\n- {len(report_content.split())} words\n\n---\n\n"

    # Final yield with complete report
    yield report_content
|